In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import regex as re
import string
import matplotlib as plt
import seaborn as sns
from currency_converter import CurrencyConverter

In [2]:
# Data has been scraped from the publicly available website https://www.imdb.com.
# All the movies with more than 100 votes have been scraped as of 01/01/2020.
# `year` holds plain years alongside text such as 'TV Movie 2019' (cleaned further
# down), so it is read as text explicitly instead of leaving the dtype to inference,
# which can produce a column mixing ints and strings.
df_mov_nam = pd.read_csv('../../data/imdb_movies.csv', dtype={"year": str})

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
# Inspect the dtype pandas assigned to each column of the raw frame.
df_mov_nam.dtypes
# The voting/review columns have different data types (int64 and float64)

imdb_title_id             object
title                     object
original_title            object
year                      object
date_published            object
genre                     object
duration                   int64
country                   object
language                  object
director                  object
writer                    object
production_company        object
actors                    object
description               object
avg_vote                 float64
votes                      int64
budget                    object
usa_gross_income          object
worlwide_gross_income     object
metascore                float64
reviews_from_users       float64
reviews_from_critics     float64
dtype: object

In [4]:
# Per-column non-null counts and dtypes of the raw frame.
df_mov_nam.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85855 entries, 0 to 85854
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_title_id          85855 non-null  object 
 1   title                  85855 non-null  object 
 2   original_title         85855 non-null  object 
 3   year                   85855 non-null  object 
 4   date_published         85855 non-null  object 
 5   genre                  85855 non-null  object 
 6   duration               85855 non-null  int64  
 7   country                85791 non-null  object 
 8   language               85022 non-null  object 
 9   director               85768 non-null  object 
 10  writer                 84283 non-null  object 
 11  production_company     81400 non-null  object 
 12  actors                 85786 non-null  object 
 13  description            83740 non-null  object 
 14  avg_vote               85855 non-null  float64
 15  vo

In [5]:
# Count the missing values in every column.
df_mov_nam.isnull().sum()
# Do we actually need the income columns? I am going to leave them for now.
# We must do something about the NaNs in the reviews columns and in others like country, language, description.

imdb_title_id                0
title                        0
original_title               0
year                         0
date_published               0
genre                        0
duration                     0
country                     64
language                   833
director                    87
writer                    1572
production_company        4455
actors                      69
description               2115
avg_vote                     0
votes                        0
budget                   62145
usa_gross_income         70529
worlwide_gross_income    54839
metascore                72550
reviews_from_users        7597
reviews_from_critics     11797
dtype: int64

In [6]:
# Peek at one randomly chosen description.
df_mov_nam["description"].sample().tolist()

["A meek tailor thinks his wooing will be helped if he assumes the identity of the famous Casanova...who's deeply in debt."]

In [7]:
# Ten random country entries: a movie can have more than one country of origin,
# stored as a single comma-separated string.
df_mov_nam['country'].sample(10).tolist()

['UK, USA', 'USA', 'USA', 'UK', 'USA', 'USA', 'India', 'USA', 'USA', 'USA']

In [8]:
# List the distinct values of date_published.
print(df_mov_nam["date_published"].unique())
# Month and day are not important for the study, so this column is going to be dropped.

['1894-10-09' '1906-12-26' '1911-08-19' ... '2020-10-22' '2019-01-13'
 '2020-09-04']


In [9]:
# Drop the columns that are not needed for the description-based film recommendation system.
df_mov = df_mov_nam.drop(columns="title")

In [10]:
# NOTE(review): this starts again from df_mov_nam rather than df_mov, so the result
# still contains 'title'; that column is dropped again further down.
df_mov = df_mov_nam.drop(columns="date_published")

In [11]:
# Keep the film income columns in a separate dataframe, saved for possible future use.
df_film_income = df_mov[["budget", "usa_gross_income", "worlwide_gross_income"]].copy()

In [12]:
# Write the income columns to disk next to the source data.
df_film_income.to_csv('../../data/df_film_income.csv')

In [13]:
# The income columns now live in df_film_income, so remove them from the working frame.
df_mov = df_mov.drop(columns=["budget", "usa_gross_income", "worlwide_gross_income"])

In [14]:
# 'title' is still present at this point because df_mov was rebuilt from df_mov_nam
# when 'date_published' was dropped. errors="ignore" keeps the cell safe to re-run
# (a second run would otherwise raise KeyError because the column is already gone).
df_mov = df_mov.drop(columns="title", errors="ignore")

The `year` column is important and must be cleaned: it is stored as `object` and has to be converted to integers.

In [15]:
# Check what kind of container .unique() returns for the year column.
type(df_mov["year"].unique())

numpy.ndarray

In [16]:
def cleanstryear(x):
    '''
    Strip non-numeric text from a single value of the ``year`` column.

    Parameters
    ----------
    x : object
        One cell of the column, e.g. ``'TV Movie 2019'``, ``'1894'`` or ``1894``.

    Returns
    -------
    object
        The four-digit year as an ``int`` when ``x`` is a string that carries
        extra text around a year (``'TV Movie 2019'`` -> ``2019``). Every other
        value (non-strings, all-digit strings, strings without a four-digit
        number) is returned unchanged so the caller can convert or inspect it.
    '''
    # Non-strings and plain numeric strings need no cleaning here.
    if not isinstance(x, str) or x.isdigit():
        return x
    # Look for a stand-alone run of exactly four digits anywhere in the text.
    found = re.search(r'(?<!\d)\d{4}(?!\d)', x)
    return int(found.group()) if found else x

In [17]:
# Run the year cleaner over every entry of the column.
df_mov["year"] = df_mov["year"].map(cleanstryear)

In [18]:
# Transform all values in year to ints
df_mov["year"] = df_mov.year.astype(int)

In [19]:
# Check the distinct years left after the conversion.
print(df_mov["year"].unique())

[1894 1906 1911 1912 1919 1913 1914 1915 1916 1917 1918 1920 1921 1924
 1922 1923 1925 1926 1935 1927 1928 1983 1929 1930 1932 1931 1937 1938
 1933 1934 1936 1940 1939 1942 1943 1941 1948 1944 2001 1946 1945 1947
 1973 1949 1950 1952 1951 1962 1953 1954 1955 1961 1956 1958 1957 1959
 1960 1963 1965 1971 1964 1966 1968 1967 1969 1976 1970 1979 1972 1981
 1978 2000 1989 1975 1974 1986 1990 2018 1977 1982 1980 1993 1984 1985
 1988 1987 2005 1991 2002 1994 1992 1995 2017 1997 1996 2006 1999 1998
 2007 2008 2003 2004 2010 2009 2011 2013 2012 2016 2015 2014 2019 2020]


NaN treatment

In [20]:
# Fill the NaNs with 0 in the text columns that have only a small number of them:
# country, language, director, production_company, actors, description and writer.
# fillna returns a new frame, so the result must be assigned back; without the
# assignment the fill is discarded and df_mov keeps its NaNs.
df_mov = df_mov.fillna({"language": 0,
                        "director": 0,
                        "country": 0,
                        "writer": 0,
                        "production_company": 0,
                        "actors": 0,
                        "description": 0})
df_mov.head(2)

Unnamed: 0,imdb_title_id,original_title,year,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,metascore,reviews_from_users,reviews_from_critics
0,tt0000009,Miss Jerry,1894,Romance,45,USA,,Alexander Black,Alexander Black,Alexander Black Photoplays,"Blanche Bayliss, William Courtenay, Chauncey D...",The adventures of a female reporter in the 1890s.,5.9,154,,1.0,2.0
1,tt0000574,The Story of the Kelly Gang,1906,"Biography, Crime, Drama",70,Australia,,Charles Tait,Charles Tait,J. and N. Tait,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,589,,7.0,7.0


In [21]:
# Same fill value (0) for every text column that still has NaNs.
df_mov = df_mov.fillna(dict.fromkeys(
    ["language", "director", "country", "writer",
     "production_company", "actors", "description"],
    0,
))

In [22]:
# Confirm that the filled text columns no longer contain missing values.
df_mov[[
    "country",
    "language",
    "director",
    "writer",
    "production_company",
    "actors",
    "description",
]].isna().sum()

country               0
language              0
director              0
writer                0
production_company    0
actors                0
description           0
dtype: int64

In [23]:
# Number of distinct genre strings (a missing value, if any, counts as one).
df_mov["genre"].nunique(dropna=False)

1257

In [24]:
# Number of distinct duration values (a missing value, if any, counts as one).
df_mov["duration"].nunique(dropna=False)

266

In [25]:
# Number of distinct country strings (a missing value, if any, counts as one).
df_mov["country"].nunique(dropna=False)

4908

In [26]:
# Number of distinct language entries (a missing value, if any, counts as one).
df_mov["language"].nunique(dropna=False)

4378

Exploring voting/review numeric values

In [27]:
# No more NaNs in the text columns; metascore, reviews_from_users and
# reviews_from_critics still contain missing values.
df_mov.isnull().sum()

imdb_title_id               0
original_title              0
year                        0
genre                       0
duration                    0
country                     0
language                    0
director                    0
writer                      0
production_company          0
actors                      0
description                 0
avg_vote                    0
votes                       0
metascore               72550
reviews_from_users       7597
reviews_from_critics    11797
dtype: int64

In [28]:
# Keep the vote and review figures in a separate dataframe so they can be saved.
df_reviews = df_mov[[
    "avg_vote",
    "votes",
    "metascore",
    "reviews_from_users",
    "reviews_from_critics",
]].copy()

In [29]:
# Write the vote/review columns to disk next to the source data.
df_reviews.to_csv('../../data/df_reviews.csv')

In [30]:
# The vote/review columns now live in df_reviews, so remove them from the working frame.
df_mov = df_mov.drop(columns=[
    "avg_vote",
    "votes",
    "metascore",
    "reviews_from_users",
    "reviews_from_critics",
])

In [31]:
# Preview the first rows of the working frame after the column drops.
df_mov.head()

Unnamed: 0,imdb_title_id,original_title,year,genre,duration,country,language,director,writer,production_company,actors,description
0,tt0000009,Miss Jerry,1894,Romance,45,USA,,Alexander Black,Alexander Black,Alexander Black Photoplays,"Blanche Bayliss, William Courtenay, Chauncey D...",The adventures of a female reporter in the 1890s.
1,tt0000574,The Story of the Kelly Gang,1906,"Biography, Crime, Drama",70,Australia,,Charles Tait,Charles Tait,J. and N. Tait,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...
2,tt0001892,Den sorte drøm,1911,Drama,53,"Germany, Denmark",0,Urban Gad,"Urban Gad, Gebhard Schätzler-Perasini",Fotorama,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...
3,tt0002101,Cleopatra,1912,"Drama, History",100,USA,English,Charles L. Gaskill,Victorien Sardou,Helen Gardner Picture Players,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",The fabled queen of Egypt's affair with Roman ...
4,tt0002130,L'Inferno,1911,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",Dante Alighieri,Milano Film,"Salvatore Papa, Arturo Pirovano, Giuseppe de L...",Loosely adapted from Dante's Divine Comedy and...


In [32]:
# Per-column non-null counts and dtypes of the working frame.
df_mov.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85855 entries, 0 to 85854
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   imdb_title_id       85855 non-null  object
 1   original_title      85855 non-null  object
 2   year                85855 non-null  int64 
 3   genre               85855 non-null  object
 4   duration            85855 non-null  int64 
 5   country             85855 non-null  object
 6   language            85855 non-null  object
 7   director            85855 non-null  object
 8   writer              85855 non-null  object
 9   production_company  85855 non-null  object
 10  actors              85855 non-null  object
 11  description         85855 non-null  object
dtypes: int64(2), object(10)
memory usage: 7.9+ MB


In [33]:
# Check the Python type of the first language entry.
type(df_mov.language[0])

str

In [34]:
# Replace the text 'None' inside language entries with '0'.
# NOTE(review): .str methods presumably return NaN for entries that are not strings
# (such as the integer 0 used as the NaN filler earlier) — confirm; the following
# cell fills the language NaNs again.
df_mov["language"] = df_mov.language.str.replace('None', '0')

In [35]:
# Fill any NaNs left in language with 0.
df_mov["language"] = df_mov["language"].fillna(0)

In [36]:
# Longest duration in the dataset; used to decide the duration bins below.
df_mov['duration'].max()

808

In [37]:
# Shortest duration in the dataset.
df_mov['duration'].min()

41

In [38]:
# Bin edges (in minutes) for film duration. The top edge is open-ended rather than
# the currently observed maximum (808), so a longer film in refreshed data still
# lands in the last bin instead of falling outside every bin.
bin_interval = [0, 60, 90, 120, 150, 180, float("inf")]

In [39]:
# One label per duration bin, ordered from shortest to longest.
labels = [
    "0 < 1h",
    "1h < 1h30m",
    "1h30m < 2h",
    "2h < 2h30m",
    "2h30m < 3h",
    "3h+",
]

In [40]:
# Assign each film to its duration bin and store the label in a new column.
df_mov['duration_sets'] = pd.cut(
    df_mov['duration'],
    bins=bin_interval,
    labels=labels,
)

In [42]:
# Save the cleaned working frame (including duration_sets) for the next stage.
df_mov.to_csv("../../data/imdb_movies_clean_1st.csv")

In [None]:
#df_mov["duration_sets"] = df_mov