In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the viewing records; the first CSV column becomes the index.
# NOTE(review): the last cell of this notebook writes the processed frame back
# to 'Final_Dataset.txt', so after one full run this path no longer holds the
# raw columns (program_name, date_, ...) - keep a pristine copy of the raw file.
df = pd.read_csv(filepath_or_buffer='Final_Dataset.txt', index_col=0)
# Disabled alternatives left by the author: load 'stcTV.csv', or dump df to 'df_nocap.csv'.

In [3]:
# Number of raw viewing records
df.shape[0]

3598607

In [4]:
# Preview the first five raw records
df.head(n=5)

Unnamed: 0,date_,user_id_maped,program_name,duration_seconds,program_class,season,episode,program_desc,program_genre,series_title,hd,original_name
1,2017-05-27,26138,100 treets,40,MOVIE,0,0,Drama Movie100 Streets,Drama,0,0,100 treets
3,2017-05-21,7946,Moana,17,MOVIE,0,0,Animation MovieMoana (HD),Animation,0,1,Moana
4,2017-08-10,7418,The Mermaid Princess,8,MOVIE,0,0,Animation MovieThe Mermaid Princess (HD),Animation,0,1,The Mermaid Princess
5,2017-07-26,19307,The Mermaid Princess,76,MOVIE,0,0,Animation MovieThe Mermaid Princess (HD),Animation,0,1,The Mermaid Princess
7,2017-07-07,15860,Churchill,87,MOVIE,0,0,Biography MovieChurchill (HD),Biography,0,1,Churchill


In [5]:
# Distinct original_name values before any text clean-up
len(df['original_name'].unique())

1828

In [6]:
# Distinct program_name values before any text clean-up
len(df['program_name'].unique())

8661

In [7]:
# Normalise names so that spelling variants collapse into a single unique name
def standardize_text(input_text):
    """Return ``input_text`` reduced to ASCII letters and digits only.

    The argument is first converted with ``str()``, so non-string values
    (numbers, ``None``, NaN) are cleaned via their string form. Every run of
    characters outside ``A-Z``, ``a-z`` and ``0-9`` - spaces, punctuation and
    non-ASCII letters included - is removed. Letter case is left untouched.
    """
    as_string = str(input_text)
    return re.sub('[^A-Za-z0-9]+', '', as_string)

In [8]:
# Clean both name columns in place with standardize_text (original_name first,
# then program_name, as before).
for name_column in ('original_name', 'program_name'):
    df[name_column] = df[name_column].apply(standardize_text)

In [9]:
# Distinct original_name values after standardisation
len(df['original_name'].unique())

1795

In [10]:
# Distinct program_name values after standardisation
len(df['program_name'].unique())

8400

In [12]:
# Handle duplicated records: rows sharing the same user, class, genre, series
# title, HD flag and title are collapsed into one row whose duration is the sum
# of the individual durations. All other columns are dropped by this step.
# (Alternative groupings on series_title, season or episode together with the
# user and title were tried by the author and left disabled.)
record_keys = [
    'user_id_maped',
    'program_class',
    'program_genre',
    'series_title',
    'hd',
    'original_name',
]
duration_per_record = df.groupby(record_keys, as_index=False)['duration_seconds']
df = duration_per_record.sum()

In [13]:
# Number of rows left after summing duplicate records
df.shape[0]

714700

In [14]:
# Handle outliers: cap every duration at 1 hour, following Statista.com statistics.
# Values at or above the cap become exactly the cap; smaller values are unchanged.
one_hour_in_seconds = 60 * 60 * 1
df['duration_seconds'] = df['duration_seconds'].clip(upper=one_hour_in_seconds)

In [15]:
# Feature binning: derive an integer "Rating" from watch time, to be treated as
# a rating.
# The actual running time of a programme is not available, so the maximum
# duration observed for each title is used in its place.
df['max_duration'] = df.groupby('original_name')['duration_seconds'].transform('max')
# A title whose maximum observed duration is 0 would yield 0/0 = NaN, and NaN
# cannot be converted to an integer. Mask those denominators so that such rows
# receive a Rating of 0 instead of raising.
nonzero_max_duration = df['max_duration'].where(df['max_duration'] != 0)
# Rating = 10 * watched / maximum observed, truncated to an integer.
df['Rating'] = (df['duration_seconds'] * 10 / nonzero_max_duration).fillna(0).astype(int)

In [None]:
# The following cells show why the duration cap matters: without a cap the
# ratings tend to collapse to zero.
df['Rating'].value_counts()  # 1 hour

0     589451
10    255270
3     130294
6      96790
7      86361
1      66420
4      48211
2      46507
8      36528
5      31168
9      23287
Name: Rating, dtype: int64

In [None]:
# Rating distribution; the saved output is labelled as the 2-hour run
df['Rating'].value_counts()  # 2 hours

0     655063
3     183179
1     177081
10    108253
2      79402
4      59892
5      38600
7      34621
6      32480
8      22348
9      19368
Name: Rating, dtype: int64

In [None]:
# Rating distribution; the saved output is labelled as the 3-hour run
df['Rating'].value_counts()  # 3 hours

0     701152
2     219752
1     209961
10     62306
3      61928
4      51249
5      38131
6      26946
7      17633
8      12042
9       9187
Name: Rating, dtype: int64

In [None]:
# Rating distribution; the saved output is labelled as the run without caps
df['Rating'].value_counts()  # no caps

0     1369083
1       23298
2        6968
3        3365
4        2095
10       1790
5        1350
6         874
7         603
8         461
9         400
Name: Rating, dtype: int64

In [16]:
# Adjust undefined genres according to IMDb.
# Keys are titles exactly as they appear in original_name; values are the genre
# to assign to every row of that title.
imdb_genre_by_title = {
    'Dunkirk': 'Action',
    'Friends': 'Comedy',
    'Harry': 'Crime',
    'SehheeWaaree': 'Others',
    'BatmanUnlimitedAnimalInstinctsforViewing': 'Animation',
}
for title, imdb_genre in imdb_genre_by_title.items():
    df.loc[df['original_name'] == title, 'program_genre'] = imdb_genre

In [17]:
# Rows per programme genre after the manual adjustments
df['program_genre'].value_counts()

Action         179840
Animation      168151
Comedy         108976
Drama          101677
Horror          56726
Thriller        45263
Biography       21069
Family          14555
Documentary     12908
Crime            3691
Sci-Fi           1000
Adventure         402
Romance           325
Others             91
Wrestling          26
Name: program_genre, dtype: int64

In [18]:
# Count the rows each user has in each genre (count lands in a 'size' column),
# and rename the genre column so it can later be attached to df as the user's
# preferred genre without clashing with df's own program_genre.
user_genres = (
    df.groupby(['user_id_maped', 'program_genre'], as_index=False)['original_name']
    .size()
    .rename(columns={"program_genre": "preferred_program_genre"})
)

In [19]:
# Rank each user's genres by row count, largest first; method='first' breaks
# ties by order of appearance, so exactly one genre per user gets rank 1.
size_per_user = user_genres.groupby('user_id_maped')['size']
user_genres['ranks'] = size_per_user.rank(method='first', ascending=False)

# Keep only the top-ranked genre of every user ...
is_top_genre = user_genres['ranks'] == 1
user_genres = user_genres.loc[is_top_genre]

# ... and attach it to every row of that user (merge on the shared column(s)).
top_genre_per_user = user_genres[['user_id_maped', 'preferred_program_genre']]
df = df.merge(top_genre_per_user)

In [20]:
#If the user's preferred genre is Animation, we assume the user is a child; otherwise we assume the user is an adult
#df['isKid'] = np.where(df.preferred_program_genre=='Animation', 1 ,0) #1 is true

In [21]:
# Flag active users: a user is active (1) when their number of rows in df is
# strictly above the mean number of rows per user, otherwise 0.
# Note the count is stored under the column name 'original_name'.
user_counts = df.groupby(['user_id_maped'], as_index=False)['original_name'].count()
mean_rows_per_user = user_counts['original_name'].mean()
user_counts['isActive'] = (user_counts['original_name'] > mean_rows_per_user).astype(int)
# Attach the flag to every row of the user (merge on the shared column(s)).
df = df.merge(user_counts[['user_id_maped', 'isActive']])


In [22]:
# Flag popular titles: a title is popular (1) when its number of rows in df is
# strictly above the mean number of rows per title, otherwise 0.
# Note the count is stored under the column name 'user_id_maped'.
movie_counts = df.groupby(['original_name'], as_index=False)['user_id_maped'].count()
mean_rows_per_title = movie_counts['user_id_maped'].mean()
movie_counts['isPopular'] = (movie_counts['user_id_maped'] > mean_rows_per_title).astype(int)
# Attach the flag to every row of the title (merge on the shared column(s)).
df = df.merge(movie_counts[['original_name', 'isPopular']])

In [23]:
# Rows belonging to popular (1) versus non-popular (0) titles
df['isPopular'].value_counts()

1    549105
0    165595
Name: isPopular, dtype: int64

In [24]:
# Rows belonging to active (1) versus non-active (0) users
df['isActive'].value_counts()

1    578805
0    135895
Name: isActive, dtype: int64

In [25]:
# Rows per preferred genre (counted per row of df, not per user)
df['preferred_program_genre'].value_counts()

Action         342269
Animation      236134
Comedy          70993
Drama           46079
Horror          16651
Biography        1124
Thriller         1034
Documentary       331
Family             72
Crime               7
Romance             2
Sci-Fi              2
Adventure           1
Wrestling           1
Name: preferred_program_genre, dtype: int64

In [26]:
# Preview the first five rows of the processed frame
df.head(n=5)

Unnamed: 0,user_id_maped,program_class,program_genre,series_title,hd,original_name,duration_seconds,max_duration,Rating,preferred_program_genre,isActive,isPopular
0,1,MOVIE,Action,0,0,MenInBlack,3600,3600,10,Action,0,1
1,25,MOVIE,Action,0,0,MenInBlack,16,3600,0,Comedy,1,1
2,46,MOVIE,Action,0,0,MenInBlack,3600,3600,10,Animation,1,1
3,161,MOVIE,Action,0,0,MenInBlack,3600,3600,10,Action,1,1
4,175,MOVIE,Action,0,0,MenInBlack,4,3600,0,Action,1,1


In [27]:
# Persist the processed frame (index included) as CSV.
# NOTE(review): this is the same path the notebook reads its input from in the
# first data cell, so the raw file is overwritten and a later Restart & Run All
# will load the processed columns instead of the raw ones - consider writing to
# a separate output file once downstream readers of this path are confirmed.
df.to_csv(path_or_buf='Final_Dataset.txt')