## User-relevant data has been selected

### Now the task is to arrange it in a format where:
### - user data is present in only one CSV file: User Content
### - user values are one-hot encoded

In [8]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [9]:
# Demographics table: one row per user.
user_data_path = 'user_demographics.csv'
user_data = pd.read_csv(user_data_path)

# Ratings table: one row per (user, anime) rating.
user_ratings_path = 'user_to_anime.csv'
user_ratings = pd.read_csv(user_ratings_path)

In [10]:
# Inspect the ratings table: list its column names, then preview the first 10 rows.
print(user_ratings.columns)
user_ratings.head(10)


Index(['user_id', 'Username', 'anime_id', 'Anime Title', 'rating'], dtype='object')


Unnamed: 0,user_id,Username,anime_id,Anime Title,rating
0,1,Xinil,21,One Piece,9
1,1,Xinil,48,.hack//Sign,7
2,1,Xinil,320,A Kite,5
3,1,Xinil,49,Aa! Megami-sama!,8
4,1,Xinil,304,Aa! Megami-sama! Movie,8
5,1,Xinil,306,Abenobashi Mahou☆Shoutengai,8
6,1,Xinil,53,Ai Yori Aoshi,7
7,1,Xinil,47,Akira,5
8,1,Xinil,591,Amaenaide yo!!,6
9,1,Xinil,54,Appleseed (Movie),7


In [11]:
# Preview the first 10 rows of the user demographics table.
user_data.head(10)

Unnamed: 0,Mal ID,Username,Gender,Mean Score,Completed,Birth_Year
0,1,Xinil,Male,7.37,233.0,1985
1,20,vondur,Male,8.06,94.0,1988
2,66,Hiromi,Male,7.53,148.0,1990
3,82,Achtor,Male,7.17,153.0,1989
4,112,luffykun,Male,8.77,125.0,1983
5,120,hazte,Male,6.69,260.0,1986
6,138,DanskiJonez,Male,9.09,188.0,1986
7,163,Holzy,Male,7.62,1478.0,1982
8,185,Fador,Male,7.58,167.0,1984
9,208,Arinohyoshi,Male,8.3,194.0,1987


In [12]:
# Attach each user's demographics to every rating row for that user.
# Rows are matched on both the numeric id ('Mal ID' <-> 'user_id') and the
# 'Username'; with how='inner', rows without a match on either side are dropped.
merged_df = (
    pd.merge(
        user_data,
        user_ratings,
        how='inner',
        left_on=['Mal ID', 'Username'],
        right_on=['user_id', 'Username'],
    )
    # 'Mal ID' duplicates 'user_id' after the join, so keep only one of them.
    .drop(columns=['Mal ID'])
)
print(merged_df.head(10))
print(merged_df.shape)

  Username Gender  Mean Score  Completed  Birth_Year  user_id  anime_id  \
0    Xinil   Male        7.37      233.0        1985        1        21   
1    Xinil   Male        7.37      233.0        1985        1        48   
2    Xinil   Male        7.37      233.0        1985        1       320   
3    Xinil   Male        7.37      233.0        1985        1        49   
4    Xinil   Male        7.37      233.0        1985        1       304   
5    Xinil   Male        7.37      233.0        1985        1       306   
6    Xinil   Male        7.37      233.0        1985        1        53   
7    Xinil   Male        7.37      233.0        1985        1        47   
8    Xinil   Male        7.37      233.0        1985        1       591   
9    Xinil   Male        7.37      233.0        1985        1        54   

                   Anime Title  rating  
0                    One Piece       9  
1                  .hack//Sign       7  
2                       A Kite       5  
3         

## Now, let's prepare this data so it is usable for the NN


In [13]:
# Count missing values per column. This is printed explicitly because a
# notebook only auto-displays the LAST expression of a cell; as a bare
# non-final expression the counts were computed and silently discarded.
print(merged_df.isna().sum())

# Preview the first rows of the merged frame.
merged_df.head()

Unnamed: 0,Username,Gender,Mean Score,Completed,Birth_Year,user_id,anime_id,Anime Title,rating
0,Xinil,Male,7.37,233.0,1985,1,21,One Piece,9
1,Xinil,Male,7.37,233.0,1985,1,48,.hack//Sign,7
2,Xinil,Male,7.37,233.0,1985,1,320,A Kite,5
3,Xinil,Male,7.37,233.0,1985,1,49,Aa! Megami-sama!,8
4,Xinil,Male,7.37,233.0,1985,1,304,Aa! Megami-sama! Movie,8


In [14]:
# Summarize the merged frame: column dtypes, row count and memory usage.
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8467079 entries, 0 to 8467078
Data columns (total 9 columns):
 #   Column       Dtype  
---  ------       -----  
 0   Username     object 
 1   Gender       object 
 2   Mean Score   float64
 3   Completed    float64
 4   Birth_Year   int64  
 5   user_id      int64  
 6   anime_id     int64  
 7   Anime Title  object 
 8   rating       int64  
dtypes: float64(2), int64(4), object(3)
memory usage: 581.4+ MB


In [15]:
# Exclude the identifier columns: user_id and anime_id.
# Exclude the text columns: Anime Title and Username.

# Keep the numeric user features: Mean Score, Completed and Birth_Year.

In [16]:

# One-hot encode 'Gender'. sparse_output=False makes the encoder return a
# dense NumPy array (one column per category) instead of a sparse matrix.
encoder = OneHotEncoder(sparse_output=False)
gender_encoded = encoder.fit_transform(merged_df[['Gender']])

# Wrap the dense array in a DataFrame whose column names come from the
# fitted encoder (prefixed with 'Gender').
gender_encoded_df = pd.DataFrame(
    data=gender_encoded,
    columns=encoder.get_feature_names_out(['Gender']),
)

# Append the indicator columns side by side, then discard the raw 'Gender'
# column. The index is reset so both frames line up row-for-row in the concat.
merged_df = (
    pd.concat([merged_df.reset_index(drop=True), gender_encoded_df], axis=1)
    .drop(columns=['Gender'])
)

# Show the first rows of the encoded frame.
print(merged_df.head())

  Username  Mean Score  Completed  Birth_Year  user_id  anime_id  \
0    Xinil        7.37      233.0        1985        1        21   
1    Xinil        7.37      233.0        1985        1        48   
2    Xinil        7.37      233.0        1985        1       320   
3    Xinil        7.37      233.0        1985        1        49   
4    Xinil        7.37      233.0        1985        1       304   

              Anime Title  rating  Gender_Female  Gender_Male  \
0               One Piece       9            0.0          1.0   
1             .hack//Sign       7            0.0          1.0   
2                  A Kite       5            0.0          1.0   
3        Aa! Megami-sama!       8            0.0          1.0   
4  Aa! Megami-sama! Movie       8            0.0          1.0   

   Gender_Non-Binary  
0                0.0  
1                0.0  
2                0.0  
3                0.0  
4                0.0  


In [17]:
# Column groups and their target ranges:
#   'Completed', 'Birth_Year' -> MinMaxScaler default range [0, 1]
#   'rating', 'Mean Score'    -> custom range [-1, 1]
columns_to_scale_default = ['Completed', 'Birth_Year']
column_to_scale_rating = ['rating','Mean Score']

scaler_default = MinMaxScaler()
scaler_rating = MinMaxScaler(feature_range=(-1, 1))

# Fit each scaler on its own column group and overwrite those columns with the
# scaled values. Note that scaler_rating is fitted on BOTH 'rating' and
# 'Mean Score', so it operates on two columns, not just 'rating'.
for _scaler, _cols in (
    (scaler_default, columns_to_scale_default),
    (scaler_rating, column_to_scale_rating),
):
    merged_df[_cols] = _scaler.fit_transform(merged_df[_cols])

# Spot-check 10 random rows of the scaled frame.
print(merged_df.sample(10))

              Username  Mean Score  Completed  Birth_Year  user_id  anime_id  \
4185285       Rorororo    0.508889   0.087019    0.948547   357938     15323   
7751621     GoldenRose    0.257778   0.003566    0.947118  1117117      2966   
7620663       Toffee86    0.457778   0.023776    0.945688  1097543     27831   
257752           calla    0.108889   0.019496    0.945688     7622      3230   
6203115   owlskywalker    0.526667   0.029838    0.946165   493705       165   
7563629       wufang90    0.431111   0.019853    0.947594  1088067     11433   
7763502     NoranekoLK    0.377778   0.038398    0.949976  1119071      7724   
3295449  Wirglyus_Kujo    0.282222   0.031978    0.950929   304383     38234   
6392663      TubbleKun    0.593333   0.008084    0.949976   505620      3457   
5728778        Clarian    0.368889   0.090466    0.951405   465273     40938   

                                               Anime Title    rating  \
4185285  One Piece: Episode of Nami - Koukaishi

In [18]:
# Columns the NN does not need as inputs:
# Username           object
# user_id            int64
# anime_id           int64
# Anime Title        object

# info() prints its summary directly, so it can run anywhere in the cell.
merged_df.info()

# Keep the random sample as the cell's LAST expression: a notebook only
# auto-displays the final expression, so as the first statement the sample
# was computed and silently discarded.
merged_df.sample(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8467079 entries, 0 to 8467078
Data columns (total 11 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Username           object 
 1   Mean Score         float64
 2   Completed          float64
 3   Birth_Year         float64
 4   user_id            int64  
 5   anime_id           int64  
 6   Anime Title        object 
 7   rating             float64
 8   Gender_Female      float64
 9   Gender_Male        float64
 10  Gender_Non-Binary  float64
dtypes: float64(7), int64(2), object(2)
memory usage: 710.6+ MB


In [19]:
# Preview the first 10 rows of merged_df after the encoding and scaling steps.
merged_df.head(10)

Unnamed: 0,Username,Mean Score,Completed,Birth_Year,user_id,anime_id,Anime Title,rating,Gender_Female,Gender_Male,Gender_Non-Binary
0,Xinil,0.415556,0.027699,0.945212,1,21,One Piece,0.777778,0.0,1.0,0.0
1,Xinil,0.415556,0.027699,0.945212,1,48,.hack//Sign,0.333333,0.0,1.0,0.0
2,Xinil,0.415556,0.027699,0.945212,1,320,A Kite,-0.111111,0.0,1.0,0.0
3,Xinil,0.415556,0.027699,0.945212,1,49,Aa! Megami-sama!,0.555556,0.0,1.0,0.0
4,Xinil,0.415556,0.027699,0.945212,1,304,Aa! Megami-sama! Movie,0.555556,0.0,1.0,0.0
5,Xinil,0.415556,0.027699,0.945212,1,306,Abenobashi Mahou☆Shoutengai,0.555556,0.0,1.0,0.0
6,Xinil,0.415556,0.027699,0.945212,1,53,Ai Yori Aoshi,0.333333,0.0,1.0,0.0
7,Xinil,0.415556,0.027699,0.945212,1,47,Akira,-0.111111,0.0,1.0,0.0
8,Xinil,0.415556,0.027699,0.945212,1,591,Amaenaide yo!!,0.111111,0.0,1.0,0.0
9,Xinil,0.415556,0.027699,0.945212,1,54,Appleseed (Movie),0.333333,0.0,1.0,0.0
