In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Import Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse import csr_matrix
from sklearn.metrics import davies_bouldin_score

## Read Dataset

In [3]:
df = pd.read_csv ('/content/drive/MyDrive/Recommendation/dfn3share.csv')
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df.head()

Unnamed: 0,user,service,count,subcat,gender
0,3646,92,127,15,1
1,3646,94,18,15,1
2,3646,93,19,15,1
3,3646,95,2,17,1
4,3646,114,1,15,1


## Hybrid Model

In [11]:
def Hybrid(df):
  # Create the matrix
  min_counting=1
  df_top_services = df[df['count']>=min_counting]
  df_counting_with_top_services = df[df['service'].isin(list(df_top_services.index))]

  min_service_rated = 1
  df_users = df_counting_with_top_services[['service','user']].groupby(['user']).agg(['count']).sort_values(('service','count'),ascending=False)
  df_top_rating_users = df_users[df_users[('service', 'count')]>=min_service_rated]
  top_rating_users = list(df_top_rating_users.index)
  df_final=df_counting_with_top_services[df_counting_with_top_services['user'].isin(top_rating_users)]
  
  df_user_item_matrix = df_final.pivot(index='user',columns='service',values='count')
  return df_user_item_matrix


In [12]:
item=Hybrid(df)

In [13]:
item[item.index==3646]

service,0,1,2,3,4,5,6,7,8,9,...,555,560,562,568,580,581,583,585,586,588
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3646,,,,,,,,,,,...,,,,,,,,,,


In [14]:
def similar_service(item):
  service_name = item[92]
  return item.corrwith(service_name).sort_values(ascending=False).head(10)
  

In [15]:
def recom_user(random_user,item):
  # random_user=3646
  random_user_df = item[item.index==random_user]
  # Let's choose non-NaN. service used by all 3646:
  service_used = random_user_df.columns[random_user_df.notna().any()].tolist()
  # we have reduced the dataset based on services watched by user 3646:
  service_used_df = item[service_used]
  # information on how many services each user used in total:
  user_service_count = service_used_df.T.notnull().sum()

  user_service_count = user_service_count.reset_index()
  user_service_count.columns = ["user","service_count"]
  # user_service_count.head().sort_values(by='service_count',ascending=False)
  # 60% of services used by 3646:
  perc = len(service_used) * 60 / 100
  # People who have use more than 60% service together with 3646 users:
  users_same_service = user_service_count[user_service_count["service_count"] > perc]["user"]
  # Let's combine the data of user #3646 and similar users:
  final_df = pd.concat([service_used_df[service_used_df.index.isin(users_same_service)],
                      random_user_df[service_used]])
  #corr for all users:
  corr_df = final_df.T.corr().unstack().sort_values().drop_duplicates()
  corr_df = pd.DataFrame(corr_df, columns=["corr"])
  corr_df.index.names = ['user_id_1', 'user_id_2']
  corr_df = corr_df.reset_index()
  # Users with a correlation of %65 or more with 3646 users:
  top_users = corr_df[(corr_df["user_id_1"] == random_user) & (corr_df["corr"] >= 0.3)][
    ["user_id_2", "corr"]].reset_index(drop=True)

  top_users = top_users.sort_values(by='corr', ascending=False)
  top_users.rename(columns={"user_id_2": "user"}, inplace=True)

  top_users_ratings = top_users.merge(df[["user", "service", "count"]], how='inner')

  top_users_ratings = top_users_ratings[top_users_ratings["user"] != random_user]

  # Calculate the Weighted Average Recommendation Score and keep the first 5 services.

  #Let's do a single score with the most similar by corr * rating:
  top_users_ratings['weighted_rating'] = top_users_ratings['corr'] * top_users_ratings['count']
  top_users_ratings.groupby('service').agg({"weighted_rating": "mean"})

  recommendation_df = top_users_ratings.groupby('service').agg({"weighted_rating": "mean"})
  recommendation_df = recommendation_df.reset_index()
  recommendation_df.head().sort_values("weighted_rating", ascending=False)
  service_to_be_recommend = recommendation_df[recommendation_df["weighted_rating"] > 3].sort_values("weighted_rating", ascending=False)
  return service_to_be_recommend['service'].to_list()



In [None]:
recom_user(3646	,item)

[92, 337, 508, 393, 175, 171, 75, 52, 20, 17]

In [None]:
def Recommender_System(userid):
    item=Hybrid(df)
    #use set to remove repeated items
    Rec_Rnn=set(recom_user(userid,item))
    #load the dataset without time for matrix correlations recommender system
    df3= pd.read_csv ('/content/drive/MyDrive/Recommendation/dfn3share.csv')
    df3 = df3.loc[:, ~df3.columns.str.contains('^Unnamed')]
    #Create the matrix
    user = df3.groupby('user')['service'].count()
    # can filter the number of services which is used by each user
    subcat_more20 = user[user >1].index.to_list()
    df_subcat = df3[df3['user'].isin(subcat_more20)].reset_index(drop=True)
    # create a matrix from pivot tabel
    matrix1 = df_subcat.pivot_table(index=['user'], columns='service', values='count').fillna(0)

    merged_filename = f"merged_output{userid}.csv"
    with open(merged_filename, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['correlations', 'service'])  # Write header row
        
        for i in Rec_Rnn:
            Subcat = matrix1[i]
            Undo_music1 = pd.DataFrame(matrix1.corrwith(Subcat), columns=['correlations']).sort_values(by=['correlations'], ascending=False).head(5)
            Undo_music1['service'] = Undo_music1.index  # Add a new column "service" with the index values
            for row in Undo_music1.iterrows():
                writer.writerow(row[1])  # Write each row directly to the CSV file

    # Load merged dataset
    merged_df = pd.read_csv(merged_filename)

    # Find duplicated rows
    duplicated_rows = merged_df[merged_df.duplicated(subset=['service'])]

    # Save the duplicated rows to CSV
    drop_filename = f"/content/drive/MyDrive/Recommendation/Recommend{userid}.csv"
    duplicated_rows.to_csv(drop_filename, index=False)

    return duplicated_rows


In [32]:
Recommender_System(3646)

[    correlations  service
 22      0.053578      172
 27      0.151900      164
 28      0.118605      341
 29      0.118145      509
 32      0.115936       19
 34      0.109399      171
 37      0.142000      139
 38      0.114109      171
 39      0.104647       19
 40      1.000000       20
 41      0.164137      139
 44      0.129542      171]

In [None]:
# Generator for having a loop for all users
# import csv
# def generate_output():
#     count=0
#     # Iterate over all users in the dataset
#     all_users = df['user'].unique()[:100]
#     print(df['user'].nunique())
#     for user in all_users:
#         item=Hybrid(df)
#         user_output = recom_user(user,item)
#         print(user)
#         print(user_output)
#         filename = f'user_{user}.csv'
#         count+=1
#         print(count)
#         with open(filename, 'w', newline='') as csvfile:
#             writer = csv.writer(csvfile)
#             writer.writerow(['User ID', 'Service'])
#             for item in user_output:
#                 writer.writerow([user, item])
#         yield user, user_output

# # Write the combined output to a single CSV file using a generator expression
# combined_filename = '/content/drive/MyDrive/Recommendation/Recommend_final_Hybrid_1000.csv'
# with open(combined_filename, 'w', newline='') as csvfile:
#     writer = csv.writer(csvfile)
#     writer.writerow(['User ID', 'Service'])
#     for user, user_output in generate_output():
#         for item in user_output:
#             writer.writerow([user, item])

426026
3646
[92, 337, 508, 393, 175, 171, 75, 52, 20, 17]
1
690214
[]
2
78178
[92, 133, 57, 15, 42]
3
469578
[92, 20, 139, 23]
4
433936
[92, 339, 80, 0, 30, 20]
5
134337
[]
6
28185
[92, 94, 58, 0, 98, 139, 20, 197, 130, 164]
7
1027430
[92, 328, 4, 100, 29, 30, 32, 98, 27, 97, 332, 68, 64, 75]
8
414837
[]
9
324625
[]
10
960157
[92, 222, 130]
11
776611
[92, 509, 147, 175]
12
736685
[92]
13
41620
[92, 139]
14
80467
[]
15
72828
[]
16
30168
[92, 20, 54]
17
647279
[]
18
1
[]
19
32919
[271, 92, 475, 509]
20
143286
[92, 333, 175, 171, 126, 20, 77, 83, 98, 97, 84, 504, 147, 42, 17, 328]
21
774012
[92, 509, 96]
22
351658
[]
23


KeyboardInterrupt: ignored

In [None]:
import pandas as pd
import csv

def Hybrid(df):
    min_counting = 1
    df_top_services = df[df['count'] >= min_counting]
    df_counting_with_top_services = df[df['service'].isin(list(df_top_services.index))]

    min_service_rated = 1
    df_users = df_counting_with_top_services[['service', 'user']].groupby(['user']).agg(['count']).sort_values(('service', 'count'), ascending=False)
    df_top_rating_users = df_users[df_users[('service', 'count')] >= min_service_rated]
    top_rating_users = list(df_top_rating_users.index)
    df_final = df_counting_with_top_services[df_counting_with_top_services['user'].isin(top_rating_users)]

    df_user_item_matrix = df_final.pivot(index='user', columns='service', values='count')
    return df_user_item_matrix

def recom_user(random_user, item):
    random_user_df = item[item.index == random_user]
    service_used = random_user_df.columns[random_user_df.notna().any()].tolist()
    service_used_df = item[service_used]
    user_service_count = service_used_df.T.notnull().sum()

    user_service_count = user_service_count.reset_index()
    user_service_count.columns = ["user", "service_count"]
    perc = len(service_used) * 60 / 100
    users_same_service = user_service_count[user_service_count["service_count"] > perc]["user"]
    final_df = pd.concat([service_used_df[service_used_df.index.isin(users_same_service)], random_user_df[service_used]])
    corr_df = final_df.T.corr().unstack().sort_values().drop_duplicates()
    corr_df = pd.DataFrame(corr_df, columns=["corr"])
    corr_df.index.names = ['user_id_1', 'user_id_2']
    corr_df = corr_df.reset_index()
    top_users = corr_df[(corr_df["user_id_1"] == random_user) & (corr_df["corr"] >= 0.65)][["user_id_2", "corr"]].reset_index(drop=True)
    top_users = top_users.sort_values(by='corr', ascending=False)
    top_users.rename(columns={"user_id_2": "user"}, inplace=True)
    top_users_ratings = top_users.merge(df[["user", "service", "count"]], how='inner')
    top_users_ratings = top_users_ratings[top_users_ratings["user"] != random_user]
    top_users_ratings['weighted_rating'] = top_users_ratings['corr'] * top_users_ratings['count']
    recommendation_df = top_users_ratings.groupby('service').agg({"weighted_rating": "mean"})
    recommendation_df = recommendation_df.reset_index()
    service_to_be_recommend = recommendation_df[recommendation_df["weighted_rating"] > 3.7].sort_values("weighted_rating", ascending=False)
    return service_to_be_recommend['service'].tolist()

def generate_output(df):
    item = Hybrid(df)
    all_users = df['user'].unique()[:1000]
    for user in all_users:
        user_output = recom_user(user, item)
        filename = f'user_{user}.csv'
        with open(filename, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['User ID', 'Service'])
            for item in user_output:
                writer.writerow([user, item])
        yield user, user_output

df = pd.read_csv('/content/drive/MyDrive/Recommendation/dfn3share.csv')
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

combined_filename = '/content/drive/MyDrive/Recommendation/Recommend_RNN_final_Hybrid_1000.csv'
with open(combined_filename, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['User ID', 'Service'])
    for user, user_output in generate_output(df):
        for item in user_output:
            writer.writerow([user, item])


AttributeError: ignored