# Data Cleaning & Fixing

Import the libraries, read the CSV file into a DataFrame, and display it.

In [258]:
import pandas as pd
import warnings

In [259]:
# Silence every warning category for the rest of the session.
# NOTE(review): a blanket 'ignore' filter also hides pandas' own warnings
# (for example about chained assignment) - consider narrowing it to the
# specific categories that are known to be noise.
warnings.filterwarnings('ignore')

In [260]:
# Load the merged genre/IMDb CSV (comma-separated, header in the first row)
# and display the resulting frame.
df = pd.read_csv('mergeGenreImdbFinal.csv')
df

Unnamed: 0,SeriesName,Creator,Genre,YearOfPublished,ChapterLengthInMiniutes,TopCast,UserReviews,CriticReviews,Views,Rating,SeriesId
0,Peacemaker,James Gunn,Action,2022?,40,"['John Cena', 'Danielle Brooks', 'Freddie Stro...",335,21,22000,8.4,13146488
1,Encanto,Jared Bush,Animation,2021,102,"['Stephanie Beatriz', 'Mar?a Cecilia Botero', ...",827,192,113000,7.3,2953050
2,After Life,Ricky Gervais,Comedy,2019?2022,30,"['Ricky Gervais', 'Tom Basden', 'Tony Way', 'D...",2000,70,112000,8.5,8398600
3,Don't Look Up,Adam McKay,Comedy,2021,18,"['Leonardo DiCaprio', 'Jennifer Lawrence', 'Me...",4100,260,388000,7.3,11286314
4,How I Met Your Father,Isaac Aptaker,Comedy,2022???,24,"['Hilary Duff', 'Christopher Lowell', 'Francia...",301,1,-1,5.1,14500082
...,...,...,...,...,...,...,...,...,...,...,...
15828,Bratz,Sean McNamara,Comedy,2007,110,"['Skyler Shaye', 'Janel Parrish', 'Logan Brown...",128,59,23000,3.2,804452
15829,,Dee Rees,Crime,2020,115,"['Anne Hathaway', 'Ben Affleck', 'Rosie Perez'...",422,54,15000,4.3,7456312
15830,Meet the Spartans,Jason Friedberg,Comedy,2008,87,"['Sean Maguire', 'Kevin Sorbo', 'Carmen Electr...",448,111,107000,2.8,1073498
15831,Supergirl,Jeannot Szwarc,Action,1984,4,"['Helen Slater', 'Faye Dunaway', ""Peter O'Tool...",186,99,22000,4.4,88206


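A function that cleans the data: placeholder values (`-1`, or `0` for the chapter length) are treated as missing (NaN), rows with missing values in the numeric columns are dropped, and duplicate rows are removed.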

In [249]:
def data_cleaning(df):
    """Return a cleaned copy of ``df``.

    Placeholder values are turned into NaN (``-1`` in ``Views``,
    ``UserReviews`` and ``CriticReviews``; ``0`` in
    ``ChapterLengthInMiniutes``), every row with a NaN in one of those four
    columns is dropped, and fully identical rows are de-duplicated.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the four columns named above.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left untouched. The surviving
        rows keep their original index labels.
    """
    # column -> value that stands for "missing" in that column
    missing_markers = {
        'Views': -1,
        'UserReviews': -1,
        'CriticReviews': -1,
        'ChapterLengthInMiniutes': 0,
    }

    # Work on a copy so the caller's frame is not modified.
    cleaned = df.copy()
    for column, marker in missing_markers.items():
        # Series.mask replaces matching cells with NaN. Assigning the whole
        # column back avoids chained assignment (df.col[mask] = ...), which
        # is not guaranteed to write through to the frame.
        cleaned[column] = cleaned[column].mask(cleaned[column] == marker)

    # drop the rows that have NaN in any of those columns
    cleaned = cleaned.dropna(subset=list(missing_markers))
    # drop duplicates if the whole row is the same
    return cleaned.drop_duplicates()

A function that fixes the data by converting each column to its proper type and removing extraneous characters (for example, the trailing `?` after the year).

In [250]:
def fix_data_value_cells(df):
    """Return a copy of ``df`` with normalised year and count columns.

    ``YearOfPublished`` is reduced to the first four characters of its text
    form (so ``'2019?2022'`` becomes ``'2019'``), and ``UserReviews``,
    ``CriticReviews``, ``ChapterLengthInMiniutes`` and ``Views`` are cast to
    ``int``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the five columns named above. The count columns
        must not contain NaN, otherwise the integer cast raises.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left untouched.
    """
    count_columns = ['UserReviews', 'CriticReviews', 'ChapterLengthInMiniutes', 'Views']

    # Work on a copy so the caller's frame is not modified.
    fixed = df.copy()
    # keep only the leading four characters (the first year) of each value
    fixed['YearOfPublished'] = [str(year)[:4] for year in fixed['YearOfPublished']]
    return fixed.astype({column: int for column in count_columns})

In [251]:
def save_csv(df, name, encoding='iso8859-8', errors='replace'):
    """Write ``df`` to the CSV file ``name`` without the index column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to write.
    name : str or path-like
        Destination file.
    encoding : str, default 'iso8859-8'
        Text encoding of the output file.
    errors : str, default 'replace'
        How characters that ``encoding`` cannot represent are handled.
        With the defaults such characters are written as ``?`` (lossy);
        pass ``encoding='utf-8'`` to keep every character intact.
    """
    df.to_csv(name, index=False, encoding=encoding, errors=errors)

In [252]:
# Replace the raw frame with its cleaned version and display the result.
df = data_cleaning(df)
df

Unnamed: 0,SeriesName,Creator,Genre,YearOfPublished,ChapterLengthInMiniutes,TopCast,UserReviews,CriticReviews,Views,Rating,SeriesId
0,Peacemaker,James Gunn,Action,2022?,40.0,"['John Cena', 'Danielle Brooks', 'Freddie Stro...",335.0,21.0,22000.0,8.4,13146488
1,Encanto,Jared Bush,Animation,2021,102.0,"['Stephanie Beatriz', 'Mar?a Cecilia Botero', ...",827.0,192.0,113000.0,7.3,2953050
2,After Life,Ricky Gervais,Comedy,2019?2022,30.0,"['Ricky Gervais', 'Tom Basden', 'Tony Way', 'D...",2000.0,70.0,112000.0,8.5,8398600
3,Don't Look Up,Adam McKay,Comedy,2021,18.0,"['Leonardo DiCaprio', 'Jennifer Lawrence', 'Me...",4100.0,260.0,388000.0,7.3,11286314
5,Cobra Kai,Josh Heald,Action,2018?,30.0,"['Ralph Macchio', 'William Zabka', 'Xolo Marid...",2200.0,152.0,155000.0,8.6,7221388
...,...,...,...,...,...,...,...,...,...,...,...
15580,,Andrzej Bartkowiak,Action,2009,96.0,"['Kristin Kreuk', 'Neal McDonough', 'Michael C...",201.0,96.0,24000.0,3.7,891592
15581,Stan Helsing,Bo Zenga,Comedy,2009,108.0,"['Steve Howey', 'Diora Baird', 'Kenan Thompson...",94.0,48.0,13000.0,3.6,1185266
15594,Fifty Shades Darker,James Foley,Drama,2017,118.0,"['Dakota Johnson', 'Jamie Dornan', 'Eric Johns...",325.0,246.0,99000.0,4.6,4465564
15612,Grease 2,Patricia Birch,Comedy,1982,115.0,"['Michelle Pfeiffer', 'Maxwell Caulfield', 'Lo...",338.0,32.0,35000.0,4.4,84021


* We can see that a large part of the data was removed. The reason is that the same series can appear in more than one genre table (most series belong to more than one genre), so the merged data contains many repeated entries.

In [253]:
# Normalise the year and the count columns of the cleaned frame, then display it.
df = fix_data_value_cells(df)
df

Unnamed: 0,SeriesName,Creator,Genre,YearOfPublished,ChapterLengthInMiniutes,TopCast,UserReviews,CriticReviews,Views,Rating,SeriesId
0,Peacemaker,James Gunn,Action,2022,40,"['John Cena', 'Danielle Brooks', 'Freddie Stro...",335,21,22000,8.4,13146488
1,Encanto,Jared Bush,Animation,2021,102,"['Stephanie Beatriz', 'Mar?a Cecilia Botero', ...",827,192,113000,7.3,2953050
2,After Life,Ricky Gervais,Comedy,2019,30,"['Ricky Gervais', 'Tom Basden', 'Tony Way', 'D...",2000,70,112000,8.5,8398600
3,Don't Look Up,Adam McKay,Comedy,2021,18,"['Leonardo DiCaprio', 'Jennifer Lawrence', 'Me...",4100,260,388000,7.3,11286314
5,Cobra Kai,Josh Heald,Action,2018,30,"['Ralph Macchio', 'William Zabka', 'Xolo Marid...",2200,152,155000,8.6,7221388
...,...,...,...,...,...,...,...,...,...,...,...
15580,,Andrzej Bartkowiak,Action,2009,96,"['Kristin Kreuk', 'Neal McDonough', 'Michael C...",201,96,24000,3.7,891592
15581,Stan Helsing,Bo Zenga,Comedy,2009,108,"['Steve Howey', 'Diora Baird', 'Kenan Thompson...",94,48,13000,3.6,1185266
15594,Fifty Shades Darker,James Foley,Drama,2017,118,"['Dakota Johnson', 'Jamie Dornan', 'Eric Johns...",325,246,99000,4.6,4465564
15612,Grease 2,Patricia Birch,Comedy,1982,115,"['Michelle Pfeiffer', 'Maxwell Caulfield', 'Lo...",338,32,35000,4.4,84021


Save the fixed and cleaned DataFrame.

In [254]:
# The cleaned and fixed frame lives in `df`; no `df_clean` variable is
# defined anywhere in this notebook, so the old call failed on a fresh kernel.
save_csv(df,"dfClean.csv")

A function that splits the `TopCast` column into five separate star columns and builds a new DataFrame.

In [255]:
def divide_columns(df_before_split):
    """Split the ``TopCast`` column into five separate star columns.

    Each ``TopCast`` cell is treated as text and split on commas; the first
    five pieces become ``star_Num1`` .. ``star_Num5``. Rows with fewer than
    five pieces (including rows with a missing cast) are padded with empty
    strings. The pieces are kept exactly as they appear in the text, so
    quotes and leading spaces are preserved.

    Parameters
    ----------
    df_before_split : pandas.DataFrame
        Frame containing the columns ``SeriesName``, ``Creator``, ``Genre``,
        ``YearOfPublished``, ``ChapterLengthInMiniutes``, ``TopCast``,
        ``UserReviews``, ``CriticReviews``, ``Views``, ``Rating`` and
        ``SeriesId``.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index as ``df_before_split``, holding the
        columns listed above with the five star columns inserted right
        after ``TopCast``.
    """
    n_stars = 5
    leading_columns = ["SeriesName", "Creator", "Genre", "YearOfPublished",
                       "ChapterLengthInMiniutes", "TopCast"]
    trailing_columns = ["UserReviews", "CriticReviews", "Views", "Rating", "SeriesId"]

    top_cast = df_before_split["TopCast"].fillna("")

    # from the list of top cast we choose the 5 main stars;
    # star_columns[k] collects the (k+1)-th star of every row
    star_columns = [[] for _ in range(n_stars)]
    for row in top_cast:
        stars = row.split(",")
        # pad short casts so that positions 0..4 always exist
        stars += [""] * (n_stars - len(stars))
        # strip() removes leading/trailing characters from the set "[.grt";
        # for a list-style string this drops the opening bracket
        stars[0] = stars[0].strip("[.grt")
        for column, star in zip(star_columns, stars):
            column.append(star)

    # Every column is taken from the frame that was passed in (not from a
    # global variable), so the function works on any frame it is given.
    data = {name: df_before_split[name] for name in leading_columns}
    for position, column in enumerate(star_columns, start=1):
        data[f"star_Num{position}"] = column
    for name in trailing_columns:
        data[name] = df_before_split[name]
    return pd.DataFrame(data)

In [256]:
# Split the cast of the cleaned frame `df` into star columns and save the
# result (`dfClean` was never defined in this notebook).
df_new = divide_columns(df)
save_csv(df_new,"df_new.csv")

In [257]:
# Display the frame that now carries the five star columns.
df_new

Unnamed: 0,SeriesName,Creator,Genre,YearOfPublished,ChapterLengthInMiniutes,TopCast,star_Num1,star_Num2,star_Num3,star_Num4,star_Num5,UserReviews,CriticReviews,Views,Rating,SeriesId
0,Peacemaker,James Gunn,Action,2022,40,"['John Cena', 'Danielle Brooks', 'Freddie Stro...",'John Cena','Danielle Brooks','Freddie Stroma','Chukwudi Iwuji','Jennifer Holland',335,21,22000,8.4,13146488
1,Encanto,Jared Bush,Animation,2021,102,"['Stephanie Beatriz', 'Mar?a Cecilia Botero', ...",'Stephanie Beatriz','Mar?a Cecilia Botero','John Leguizamo','Mauro Castillo','Jessica Darrow',827,192,113000,7.3,2953050
2,After Life,Ricky Gervais,Comedy,2019,30,"['Ricky Gervais', 'Tom Basden', 'Tony Way', 'D...",'Ricky Gervais','Tom Basden','Tony Way','Diane Morgan','Kerry Godliman',2000,70,112000,8.5,8398600
3,Don't Look Up,Adam McKay,Comedy,2021,18,"['Leonardo DiCaprio', 'Jennifer Lawrence', 'Me...",'Leonardo DiCaprio','Jennifer Lawrence','Meryl Streep','Cate Blanchett','Rob Morgan',4100,260,388000,7.3,11286314
5,Cobra Kai,Josh Heald,Action,2018,30,"['Ralph Macchio', 'William Zabka', 'Xolo Marid...",'Ralph Macchio','William Zabka','Xolo Maridue?a','Courtney Henggeler','Tanner Buchanan',2200,152,155000,8.6,7221388
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15580,,Andrzej Bartkowiak,Action,2009,96,"['Kristin Kreuk', 'Neal McDonough', 'Michael C...",'Kristin Kreuk','Neal McDonough','Michael Clarke Duncan','Chris Klein','Robin Shou',201,96,24000,3.7,891592
15581,Stan Helsing,Bo Zenga,Comedy,2009,108,"['Steve Howey', 'Diora Baird', 'Kenan Thompson...",'Steve Howey','Diora Baird','Kenan Thompson','Desi Lydic','Leslie Nielsen',94,48,13000,3.6,1185266
15594,Fifty Shades Darker,James Foley,Drama,2017,118,"['Dakota Johnson', 'Jamie Dornan', 'Eric Johns...",'Dakota Johnson','Jamie Dornan','Eric Johnson','Eloise Mumford','Bella Heathcote',325,246,99000,4.6,4465564
15612,Grease 2,Patricia Birch,Comedy,1982,115,"['Michelle Pfeiffer', 'Maxwell Caulfield', 'Lo...",'Michelle Pfeiffer','Maxwell Caulfield','Lorna Luft','Maureen Teefy','Alison Price',338,32,35000,4.4,84021


* We will drop the outlier values in the next part, after visualizing the data.

End of Data Cleaning