## User-relevant data has been selected

### Now the task is to arrange it in a format where:
### - user data is present in only one CSV file: User Content
### - user values are one-hot encoded

In [7]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [2]:
# Input CSV locations, given relative to the notebook's working directory.
user_data_path = 'user_demographics.csv'
user_ratings_path = 'user_to_anime.csv'

# user_data holds the demographic columns (Mal ID, Username, Gender, ...);
# user_ratings holds one row per (user_id, anime_id) rating.
user_data = pd.read_csv(user_data_path)
user_ratings = pd.read_csv(user_ratings_path)

In [3]:
# Inspect the ratings table: column names first, then the first 10 rows.
print(user_ratings.columns)
user_ratings.head(10)


Index(['user_id', 'Username', 'anime_id', 'Anime Title', 'rating'], dtype='object')


Unnamed: 0,user_id,Username,anime_id,Anime Title,rating
0,1,Xinil,21,One Piece,9
1,1,Xinil,48,.hack//Sign,7
2,1,Xinil,320,A Kite,5
3,1,Xinil,49,Aa! Megami-sama!,8
4,1,Xinil,304,Aa! Megami-sama! Movie,8
5,1,Xinil,306,Abenobashi Mahou☆Shoutengai,8
6,1,Xinil,53,Ai Yori Aoshi,7
7,1,Xinil,47,Akira,5
8,1,Xinil,591,Amaenaide yo!!,6
9,1,Xinil,54,Appleseed (Movie),7


In [4]:
# Preview the first 10 rows of the demographics table.
user_data.head(10)

Unnamed: 0,Mal ID,Username,Gender,Mean Score,Completed,Birth_Year
0,1,Xinil,Male,7.37,233.0,1985
1,20,vondur,Male,8.06,94.0,1988
2,66,Hiromi,Male,7.53,148.0,1990
3,82,Achtor,Male,7.17,153.0,1989
4,112,luffykun,Male,8.77,125.0,1983
5,120,hazte,Male,6.69,260.0,1986
6,138,DanskiJonez,Male,9.09,188.0,1986
7,163,Holzy,Male,7.62,1478.0,1982
8,185,Fador,Male,7.58,167.0,1984
9,208,Arinohyoshi,Male,8.3,194.0,1987


In [5]:
# Attach each user's demographic columns to every one of their rating rows.
# A row survives only when both the id ('Mal ID' == 'user_id') and the
# 'Username' agree in the two tables (inner join). 'Mal ID' duplicates
# 'user_id' after the join, so it is dropped.
merged_df = pd.merge(
    user_data,
    user_ratings,
    how='inner',
    left_on=['Mal ID', 'Username'],
    right_on=['user_id', 'Username'],
).drop(columns=['Mal ID'])

print(merged_df.head(10))
print(merged_df.shape)

  Username Gender  Mean Score  Completed  Birth_Year  user_id  anime_id  \
0    Xinil   Male        7.37      233.0        1985        1        21   
1    Xinil   Male        7.37      233.0        1985        1        48   
2    Xinil   Male        7.37      233.0        1985        1       320   
3    Xinil   Male        7.37      233.0        1985        1        49   
4    Xinil   Male        7.37      233.0        1985        1       304   
5    Xinil   Male        7.37      233.0        1985        1       306   
6    Xinil   Male        7.37      233.0        1985        1        53   
7    Xinil   Male        7.37      233.0        1985        1        47   
8    Xinil   Male        7.37      233.0        1985        1       591   
9    Xinil   Male        7.37      233.0        1985        1        54   

                   Anime Title  rating  
0                    One Piece       9  
1                  .hack//Sign       7  
2                       A Kite       5  
3         

In [6]:
# Checkpoint the merged frame to disk.
# NOTE(review): to_csv is called with its defaults, so the row index is written
# as an extra unnamed first column; pass index=False if readers do not expect it.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so the
# file produced by that run may be incomplete.
merged_df.to_csv("user_data_temp_final.csv")

KeyboardInterrupt: 

## Now, let's prepare this data for use in the NN


In [None]:
# Summarize the merged frame: row count, column dtypes and memory usage.
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8467079 entries, 0 to 8467078
Data columns (total 9 columns):
 #   Column       Dtype  
---  ------       -----  
 0   Username     object 
 1   Gender       object 
 2   Mean Score   float64
 3   Completed    float64
 4   Birth_Year   int64  
 5   user_id      int64  
 6   anime_id     int64  
 7   Anime Title  object 
 8   rating       int64  
dtypes: float64(2), int64(4), object(3)
memory usage: 581.4+ MB


In [None]:
# Columns to exclude from the NN input: user_id and anime_id,
# as well as Anime Title and Username.

# Columns to keep: Mean Score, Completed (the original note read "completed year" -- TODO confirm whether Birth_Year was meant too).

In [None]:

# One-hot encode 'Gender' into dense Gender_<category> columns and drop the
# original column.
#
# The guard keeps this cell re-runnable: merged_df is overwritten below with
# 'Gender' removed, so without it a second run would raise a KeyError on
# merged_df[['Gender']].
if 'Gender' in merged_df.columns:
    # sparse_output=False makes fit_transform return a dense array
    # (the parameter was called `sparse` in older scikit-learn releases).
    encoder = OneHotEncoder(sparse_output=False)

    # Double brackets: the encoder expects a 2-D input (a one-column frame).
    gender_encoded = encoder.fit_transform(merged_df[['Gender']])

    # Wrap the encoded array in a DataFrame with one named column per category.
    gender_encoded_df = pd.DataFrame(
        gender_encoded,
        columns=encoder.get_feature_names_out(['Gender']),
    )

    # gender_encoded_df carries a fresh 0..n-1 index, so reset merged_df's
    # index as well to make the column-wise concat line up row by row.
    merged_df = pd.concat([merged_df.reset_index(drop=True), gender_encoded_df], axis=1)

    # The raw categorical column is now redundant.
    merged_df = merged_df.drop(columns=['Gender'])

# Display the updated DataFrame
print(merged_df.head())

  Username  Mean Score  Completed  Birth_Year  user_id  anime_id  \
0    Xinil        7.37      233.0        1985        1        21   
1    Xinil        7.37      233.0        1985        1        48   
2    Xinil        7.37      233.0        1985        1       320   
3    Xinil        7.37      233.0        1985        1        49   
4    Xinil        7.37      233.0        1985        1       304   

              Anime Title  rating  Gender_Female  Gender_Male  \
0               One Piece       9            0.0          1.0   
1             .hack//Sign       7            0.0          1.0   
2                  A Kite       5            0.0          1.0   
3        Aa! Megami-sama!       8            0.0          1.0   
4  Aa! Megami-sama! Movie       8            0.0          1.0   

   Gender_Non-Binary  
0                0.0  
1                0.0  
2                0.0  
3                0.0  
4                0.0  


In [None]:
# Column groups and the target range each one is scaled to:
#   'Completed', 'Birth_Year' -> MinMaxScaler default range [0, 1]
#   'rating', 'Mean Score'    -> custom range [-1, 1]
columns_to_scale_default = ['Completed', 'Birth_Year']
column_to_scale_rating = ['rating','Mean Score']

scaler_default = MinMaxScaler()
scaler_rating = MinMaxScaler(feature_range=(-1, 1))

# Fit each scaler on its own column group and overwrite those columns with
# the scaled values (default-range group first, then the [-1, 1] group).
for scaler, columns in (
    (scaler_default, columns_to_scale_default),
    (scaler_rating, column_to_scale_rating),
):
    merged_df[columns] = scaler.fit_transform(merged_df[columns])

# Show 10 random rows of the updated frame (no seed, so rows differ per run).
print(merged_df.sample(10))

              Username  Mean Score  Completed  Birth_Year  user_id  anime_id  \
4185285       Rorororo    0.508889   0.087019    0.948547   357938     15323   
7751621     GoldenRose    0.257778   0.003566    0.947118  1117117      2966   
7620663       Toffee86    0.457778   0.023776    0.945688  1097543     27831   
257752           calla    0.108889   0.019496    0.945688     7622      3230   
6203115   owlskywalker    0.526667   0.029838    0.946165   493705       165   
7563629       wufang90    0.431111   0.019853    0.947594  1088067     11433   
7763502     NoranekoLK    0.377778   0.038398    0.949976  1119071      7724   
3295449  Wirglyus_Kujo    0.282222   0.031978    0.950929   304383     38234   
6392663      TubbleKun    0.593333   0.008084    0.949976   505620      3457   
5728778        Clarian    0.368889   0.090466    0.951405   465273     40938   

                                               Anime Title    rating  \
4185285  One Piece: Episode of Nami - Koukaishi

In [None]:
# Column dtypes and memory usage after encoding and scaling.
merged_df.info()

# Columns the NN does not need:
# Username           object
# user_id            int64
# anime_id           int64
# Anime Title        object

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8467079 entries, 0 to 8467078
Data columns (total 11 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Username           object 
 1   Mean Score         float64
 2   Completed          float64
 3   Birth_Year         float64
 4   user_id            int64  
 5   anime_id           int64  
 6   Anime Title        object 
 7   rating             float64
 8   Gender_Female      float64
 9   Gender_Male        float64
 10  Gender_Non-Binary  float64
dtypes: float64(7), int64(2), object(2)
memory usage: 710.6+ MB


In [None]:
# Preview the first 10 rows after one-hot encoding and scaling.
merged_df.head(10)

Unnamed: 0,Username,Mean Score,Completed,Birth_Year,user_id,anime_id,Anime Title,rating,Gender_Female,Gender_Male,Gender_Non-Binary
0,Xinil,0.415556,0.027699,0.945212,1,21,One Piece,0.777778,0.0,1.0,0.0
1,Xinil,0.415556,0.027699,0.945212,1,48,.hack//Sign,0.333333,0.0,1.0,0.0
2,Xinil,0.415556,0.027699,0.945212,1,320,A Kite,-0.111111,0.0,1.0,0.0
3,Xinil,0.415556,0.027699,0.945212,1,49,Aa! Megami-sama!,0.555556,0.0,1.0,0.0
4,Xinil,0.415556,0.027699,0.945212,1,304,Aa! Megami-sama! Movie,0.555556,0.0,1.0,0.0
5,Xinil,0.415556,0.027699,0.945212,1,306,Abenobashi Mahou☆Shoutengai,0.555556,0.0,1.0,0.0
6,Xinil,0.415556,0.027699,0.945212,1,53,Ai Yori Aoshi,0.333333,0.0,1.0,0.0
7,Xinil,0.415556,0.027699,0.945212,1,47,Akira,-0.111111,0.0,1.0,0.0
8,Xinil,0.415556,0.027699,0.945212,1,591,Amaenaide yo!!,0.111111,0.0,1.0,0.0
9,Xinil,0.415556,0.027699,0.945212,1,54,Appleseed (Movie),0.333333,0.0,1.0,0.0


## try after model 2 

In [14]:
new_df = pd.read_csv("User_input.csv")

# One boolean per column, in column order. Despite the name, an entry is True
# when that column contains NO missing values.
isany = new_df.notna().all().tolist()
isany

[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True]

In [9]:
# Preview the first 5 rows of the genre-augmented table.
new_df.head()

Unnamed: 0,Username,Gender,Mean Score,Completed,Birth_Year,user_id,anime_id,Anime Title,rating,Genres,...,Genre_Gourmet,Genre_Hentai,Genre_Horror,Genre_Mystery,Genre_Romance,Genre_Sci-Fi,Genre_Slice of Life,Genre_Sports,Genre_Supernatural,Genre_Suspense
0,Xinil,Male,7.37,233.0,1985,1,21,One Piece,9,"Action, Adventure, Fantasy",...,0,0,0,0,0,0,0,0,0,0
1,Xinil,Male,7.37,233.0,1985,1,48,.hack//Sign,7,"Adventure, Fantasy, Mystery",...,0,0,0,1,0,0,0,0,0,0
2,Xinil,Male,7.37,233.0,1985,1,320,A Kite,5,"Action, Drama, Hentai",...,0,1,0,0,0,0,0,0,0,0
3,Xinil,Male,7.37,233.0,1985,1,49,Aa! Megami-sama!,8,"Comedy, Romance, Supernatural",...,0,0,0,0,1,0,0,0,1,0
4,Xinil,Male,7.37,233.0,1985,1,304,Aa! Megami-sama! Movie,8,"Comedy, Romance, Supernatural",...,0,0,0,0,1,0,0,0,1,0


In [17]:
# List every column name, then summarize dtypes and memory usage.
print(new_df.columns)
new_df.info()


Index(['Username', 'Gender', 'Mean Score', 'Completed', 'Birth_Year',
       'user_id', 'anime_id', 'Anime Title', 'rating', 'Genres',
       'Genre_Action', 'Genre_Adventure', 'Genre_Avant Garde',
       'Genre_Award Winning', 'Genre_Boys Love', 'Genre_Comedy', 'Genre_Drama',
       'Genre_Ecchi', 'Genre_Erotica', 'Genre_Fantasy', 'Genre_Girls Love',
       'Genre_Gourmet', 'Genre_Hentai', 'Genre_Horror', 'Genre_Mystery',
       'Genre_Romance', 'Genre_Sci-Fi', 'Genre_Slice of Life', 'Genre_Sports',
       'Genre_Supernatural', 'Genre_Suspense'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8466877 entries, 0 to 8466876
Data columns (total 31 columns):
 #   Column               Dtype  
---  ------               -----  
 0   Username             object 
 1   Gender               object 
 2   Mean Score           float64
 3   Completed            float64
 4   Birth_Year           int64  
 5   user_id              int64  
 6   anime_id             int64  
 7   

In [20]:
from collections import defaultdict

# The genre indicator columns are the ones whose name starts with 'Genre_'.
genre_columns = [name for name in new_df.columns if name.startswith('Genre_')]

# Per-user sum of each indicator column, i.e. the number of that user's rows
# flagged with each genre.
user_genre_counts = new_df.groupby('user_id')[genre_columns].sum()

# Nested mapping {user_id: {genre column: count}}; genres with a zero count
# are left out of a user's inner dict.
user_counts = {}
for user_id, row in user_genre_counts.iterrows():
    user_counts[user_id] = row[row > 0].to_dict()



In [21]:
# Display the nested {user_id: {genre column: count}} mapping.
# NOTE(review): this prints the entry for every user; consider showing a single key instead.
user_counts

{1: {'Genre_Action': 105,
  'Genre_Adventure': 61,
  'Genre_Avant Garde': 5,
  'Genre_Award Winning': 34,
  'Genre_Comedy': 106,
  'Genre_Drama': 89,
  'Genre_Ecchi': 36,
  'Genre_Fantasy': 54,
  'Genre_Girls Love': 1,
  'Genre_Gourmet': 1,
  'Genre_Hentai': 12,
  'Genre_Horror': 16,
  'Genre_Mystery': 22,
  'Genre_Romance': 101,
  'Genre_Sci-Fi': 71,
  'Genre_Slice of Life': 13,
  'Genre_Sports': 18,
  'Genre_Supernatural': 35,
  'Genre_Suspense': 11},
 20: {'Genre_Action': 61,
  'Genre_Adventure': 46,
  'Genre_Avant Garde': 1,
  'Genre_Award Winning': 21,
  'Genre_Comedy': 22,
  'Genre_Drama': 26,
  'Genre_Ecchi': 3,
  'Genre_Fantasy': 41,
  'Genre_Gourmet': 1,
  'Genre_Horror': 6,
  'Genre_Mystery': 11,
  'Genre_Romance': 14,
  'Genre_Sci-Fi': 21,
  'Genre_Slice of Life': 3,
  'Genre_Sports': 8,
  'Genre_Supernatural': 14,
  'Genre_Suspense': 9},
 66: {'Genre_Action': 68,
  'Genre_Adventure': 55,
  'Genre_Avant Garde': 1,
  'Genre_Award Winning': 20,
  'Genre_Comedy': 90,
  'Genre_D

In [22]:
# The genre indicator columns are the ones whose name starts with 'Genre_'.
genre_columns = [name for name in new_df.columns if name.startswith('Genre_')]

# Work on a copy so new_df keeps its original 0/1 indicators, then scale every
# indicator by that row's rating in one vectorized step: a flagged genre now
# carries the rating, an unflagged one stays 0.
weighted_genre_df = new_df.copy()
weighted_genre_df[genre_columns] = weighted_genre_df[genre_columns].mul(
    weighted_genre_df['rating'], axis=0
)

# Per-user total of the rating-weighted indicators.
user_genre_rating_sum = weighted_genre_df.groupby('user_id')[genre_columns].sum()

# Nested mapping {user_id: {genre column: rating sum}}; genres whose sum is
# not positive are left out of a user's inner dict.
user_rating_sums = {}
for user_id, row in user_genre_rating_sum.iterrows():
    user_rating_sums[user_id] = row[row > 0].to_dict()

In [23]:
# Display the nested {user_id: {genre column: rating sum}} mapping.
# NOTE(review): this prints the entry for every user; consider showing a single key instead.
user_rating_sums

{1: {'Genre_Action': 806,
  'Genre_Adventure': 482,
  'Genre_Avant Garde': 40,
  'Genre_Award Winning': 278,
  'Genre_Comedy': 744,
  'Genre_Drama': 683,
  'Genre_Ecchi': 234,
  'Genre_Fantasy': 407,
  'Genre_Girls Love': 3,
  'Genre_Gourmet': 8,
  'Genre_Hentai': 65,
  'Genre_Horror': 117,
  'Genre_Mystery': 173,
  'Genre_Romance': 734,
  'Genre_Sci-Fi': 523,
  'Genre_Slice of Life': 98,
  'Genre_Sports': 136,
  'Genre_Supernatural': 268,
  'Genre_Suspense': 90},
 20: {'Genre_Action': 489,
  'Genre_Adventure': 379,
  'Genre_Avant Garde': 9,
  'Genre_Award Winning': 177,
  'Genre_Comedy': 171,
  'Genre_Drama': 209,
  'Genre_Ecchi': 17,
  'Genre_Fantasy': 336,
  'Genre_Gourmet': 8,
  'Genre_Horror': 46,
  'Genre_Mystery': 92,
  'Genre_Romance': 112,
  'Genre_Sci-Fi': 172,
  'Genre_Slice of Life': 25,
  'Genre_Sports': 67,
  'Genre_Supernatural': 118,
  'Genre_Suspense': 77},
 66: {'Genre_Action': 509,
  'Genre_Adventure': 402,
  'Genre_Avant Garde': 7,
  'Genre_Award Winning': 158,
  'G

In [24]:
# Average rating per genre for every user: rating sum divided by count.
# Only genres present in user_counts are considered. A genre missing from
# user_rating_sums contributes a total of 0, and the count > 0 check keeps
# the division safe.
updated_user_avg_rating = {
    user_id: {
        genre: user_rating_sums.get(user_id, {}).get(genre, 0) / count
        for genre, count in genre_counts.items()
        if count > 0
    }
    for user_id, genre_counts in user_counts.items()
}


In [26]:
# Spot-check the computed per-genre averages for user_id 20.
updated_user_avg_rating[20]

{'Genre_Action': 8.01639344262295,
 'Genre_Adventure': 8.23913043478261,
 'Genre_Avant Garde': 9.0,
 'Genre_Award Winning': 8.428571428571429,
 'Genre_Comedy': 7.7727272727272725,
 'Genre_Drama': 8.038461538461538,
 'Genre_Ecchi': 5.666666666666667,
 'Genre_Fantasy': 8.195121951219512,
 'Genre_Gourmet': 8.0,
 'Genre_Horror': 7.666666666666667,
 'Genre_Mystery': 8.363636363636363,
 'Genre_Romance': 8.0,
 'Genre_Sci-Fi': 8.19047619047619,
 'Genre_Slice of Life': 8.333333333333334,
 'Genre_Sports': 8.375,
 'Genre_Supernatural': 8.428571428571429,
 'Genre_Suspense': 8.555555555555555}

## So it's verified that Final_user_dataset is doing the averaging part correctly; let's see where we can improve now