# This is a Project that shows correlation using Python

In [120]:
# Importing Libraries

import numpy as np
import pandas as pd

import seaborn as sns

import matplotlib
import matplotlib as plt
plt.style.use('ggplot')
from matplotlib.pyplot import figure


%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12, 8)  # Adjusts the figsize of the graphs


# Reading in the data

df = pd.read_csv('movies.csv')

In [None]:
# Optional 
# Removing the limit of rows shown in the Jupyter notebook
pd.set_option('display.max_rows', None)

## Looking at the Data

In [121]:
df.head()

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,19000000.0,46998772.0,Warner Bros.,146.0
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,4500000.0,58853106.0,Columbia Pictures,104.0
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,18000000.0,538375067.0,Lucasfilm,124.0
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,3500000.0,83453539.0,Paramount Pictures,88.0
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,6000000.0,39846344.0,Orion Pictures,98.0


In [122]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7668 entries, 0 to 7667
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      7668 non-null   object 
 1   rating    7591 non-null   object 
 2   genre     7668 non-null   object 
 3   year      7668 non-null   int64  
 4   released  7666 non-null   object 
 5   score     7665 non-null   float64
 6   votes     7665 non-null   float64
 7   director  7668 non-null   object 
 8   writer    7665 non-null   object 
 9   star      7667 non-null   object 
 10  country   7665 non-null   object 
 11  budget    5497 non-null   float64
 12  gross     7479 non-null   float64
 13  company   7651 non-null   object 
 14  runtime   7664 non-null   float64
dtypes: float64(5), int64(1), object(9)
memory usage: 898.7+ KB


In [123]:
df.describe()

Unnamed: 0,year,score,votes,budget,gross,runtime
count,7668.0,7665.0,7665.0,5497.0,7479.0,7664.0
mean,2000.405451,6.390411,88108.5,35589880.0,78500540.0,107.261613
std,11.153508,0.968842,163323.8,41457300.0,165725100.0,18.581247
min,1980.0,1.9,7.0,3000.0,309.0,55.0
25%,1991.0,5.8,9100.0,10000000.0,4532056.0,95.0
50%,2000.0,6.5,33000.0,20500000.0,20205760.0,104.0
75%,2010.0,7.1,93000.0,45000000.0,76016690.0,116.0
max,2020.0,9.3,2400000.0,356000000.0,2847246000.0,366.0


## Nulls and Missing Data 

We can distinguish there are a few null values in some columns but in the budget field we have more than 2100 values that are null. We need to clean those values/rows.

In [124]:
for col in df.columns:
    missing_values = df[col].isna().sum()
    print(f'{col}: {missing_values}')

name: 0
rating: 77
genre: 0
year: 0
released: 2
score: 3
votes: 3
director: 0
writer: 3
star: 1
country: 3
budget: 2171
gross: 189
company: 17
runtime: 4


In [125]:
# Data Types of our DataSet

df.dtypes

name         object
rating       object
genre        object
year          int64
released     object
score       float64
votes       float64
director     object
writer       object
star         object
country      object
budget      float64
gross       float64
company      object
runtime     float64
dtype: object

In [126]:
# Droping the N/A Values.

df.dropna(how = 'any', axis = 0, inplace = True)

In [127]:
# Clearing the data types

df['budget'] = df['budget'].astype('int64')

df['gross'] = df['gross'].astype('int64')

In [128]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5421 entries, 0 to 7652
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      5421 non-null   object 
 1   rating    5421 non-null   object 
 2   genre     5421 non-null   object 
 3   year      5421 non-null   int64  
 4   released  5421 non-null   object 
 5   score     5421 non-null   float64
 6   votes     5421 non-null   float64
 7   director  5421 non-null   object 
 8   writer    5421 non-null   object 
 9   star      5421 non-null   object 
 10  country   5421 non-null   object 
 11  budget    5421 non-null   int64  
 12  gross     5421 non-null   int64  
 13  company   5421 non-null   object 
 14  runtime   5421 non-null   float64
dtypes: float64(3), int64(3), object(9)
memory usage: 677.6+ KB


In [132]:
# Extracts the Correct Year from the released column

def extract_year(string):
    # Finds the year with string manipulation
    year = string.split(' ')[2:3]
    return ''.join(year)

# Creation of the correctYear Column

df['yearcorrect'] = df['released'].astype(str).apply(extract_year)
df.head()

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime,yearcorrect
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,19000000,46998772,Warner Bros.,146.0,1980
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,4500000,58853106,Columbia Pictures,104.0,1980
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,18000000,538375067,Lucasfilm,124.0,1980
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,3500000,83453539,Paramount Pictures,88.0,1980
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,6000000,39846344,Orion Pictures,98.0,1980


In [152]:
#Find duplicates by comparing the original with the next df. 

df.nunique() # Counts unique Names

name           5336
rating           10
genre            15
year             41
released       2560
score            72
votes           717
director       2063
writer         3205
star           1845
country          50
budget          398
gross          5418
company        1475
runtime         127
yearcorrect      45
dtype: int64

In [157]:
# Count unique values after removing duplicates

df.drop_duplicates().sort_values(by = ['gross'], ascending=False).nunique()
# There are no differences so there are not duplicates.

name           5336
rating           10
genre            15
year             41
released       2560
score            72
votes           717
director       2063
writer         3205
star           1845
country          50
budget          398
gross          5418
company        1475
runtime         127
yearcorrect      45
dtype: int64

## Discovering Correlations by visualization