# Data

In [1]:
import pandas as pd

In [2]:
basics = pd.read_csv("https://datasets.imdbws.com/title.basics.tsv.gz", sep='\t',
                    low_memory=False)
ratings = pd.read_csv("https://datasets.imdbws.com/title.ratings.tsv.gz",
                      sep='\t', low_memory=False)
akas = pd.read_csv("https://datasets.imdbws.com/title.akas.tsv.gz", sep='\t',
                    low_memory=False)

In [3]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9446373 entries, 0 to 9446372
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 648.6+ MB


Some columns need to be converted to numerical

## Cleaning Data

### basics

In [4]:
basics.duplicated().sum()

0

In [5]:
basics.isna().sum()

tconst             0
titleType          0
primaryTitle      11
originalTitle     11
isAdult            0
startYear          0
endYear            0
runtimeMinutes     0
genres            10
dtype: int64

In [6]:
basics.dropna(inplace=True)
basics.isna().sum()

tconst            0
titleType         0
primaryTitle      0
originalTitle     0
isAdult           0
startYear         0
endYear           0
runtimeMinutes    0
genres            0
dtype: int64

In [7]:
basics['titleType'].value_counts()

tvEpisode       7140494
short            903947
movie            629554
video            268393
tvSeries         235544
tvMovie          138663
tvMiniSeries      46345
tvSpecial         39562
videoGame         33079
tvShort           10769
tvPilot               2
Name: titleType, dtype: int64

In [8]:
basics = basics[basics['titleType'] == 'movie']
basics['titleType'].value_counts()

movie    629554
Name: titleType, dtype: int64

In [9]:
basics = basics[basics['genres'] != 'Documentary']
basics['genres'].value_counts()

Drama                        116878
\N                            71509
Comedy                        44608
Horror                        15810
Action                        14338
                              ...  
Action,History,Music              1
Biography,Romance,Western         1
Adventure,Music,Musical           1
Adult,Drama,Reality-TV            1
Biography,Fantasy,Musical         1
Name: genres, Length: 1470, dtype: int64

In [None]:
# Function to turn columns that should be numeric but aren't
# into numeric columns
def num(i):
    basics[i] = pd.to_numeric(basics[i], errors='coerce')
    basics.dropna(inplace=True)
    basics[i] = basics[i].astype(int)

In [None]:
basics.info()

In [None]:
num('startYear')
num('runtimeMinutes')
num('isAdult')

In [None]:
# Shrinking the range of values for releases to movies between
# 2000 up to 2021
basics = basics[basics['startYear'] >= 2000]
basics = basics[basics['startYear'] <= 2021]
print(min(basics['startYear']))
max(basics['startYear'])

In [None]:
# Cheking endYear Column
basics['endYear'].value_counts()

In [None]:
# Dropping the empty column
basics.drop(columns='endYear', inplace=True)

In [None]:
basics.info()

### ratings

In [13]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1257088 entries, 0 to 1257087
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1257088 non-null  object 
 1   averageRating  1257088 non-null  float64
 2   numVotes       1257088 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 28.8+ MB


### akas

In [14]:
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34183552 entries, 0 to 34183551
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          object
 1   ordering         int64 
 2   title            object
 3   region           object
 4   language         object
 5   types            object
 6   attributes       object
 7   isOriginalTitle  object
dtypes: int64(1), object(7)
memory usage: 2.0+ GB


In [15]:
akas['region'].value_counts()

DE    4078518
JP    4077310
FR    4076684
IN    4011862
ES    3999917
       ...   
CC          1
TV          1
NU          1
PW          1
NR          1
Name: region, Length: 248, dtype: int64

In [17]:
akas['region'] = akas[akas['region'] == 'US']
akas['region'].value_counts()

ValueError: Columns must be same length as key