# Linear Regression on IMDb Movies India Dataset
## Done By: Muhammad Uzair Saleem
## Company: CodSoft
## Batch: Dec Batch A22  

In [31]:
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt


### Checking dataset encoding type 

In [32]:
import chardet

with open('IMDb Movies India.csv', 'rb') as file:
    result = chardet.detect(file.read())

encoding = result['encoding']
print(f"The detected encoding is: {encoding}")


The detected encoding is: ISO-8859-1


In [33]:
df = pd.read_csv('IMDb Movies India.csv',encoding='ISO-8859-1')

In [34]:
df.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


### Converting values into numeric form

In [35]:
df['Duration'] = pd.to_numeric(df['Duration'].str.replace(r'\D','',regex=True))
df.dropna(subset=['Name', 'Year', 'Duration', 'Rating', 'Votes', 'Director', 'Actor 1', 'Actor 2', 'Actor 3'], inplace=True)
df['Genre'] = df['Genre'].str.split(',')
df = df.explode('Genre')
df['Genre'] = df['Genre'].fillna('Mode')
df['Votes'] = pd.to_numeric(df['Votes'].str.replace(',',''),errors='coerce')
df['Year'] = pd.to_numeric(df['Year'].str.replace(r'[()]','',regex=True))


### Creating new columns of string values also converting them into mean

In [36]:
df['genre_mean_rating'] = df.groupby('Genre')['Rating'].transform('mean')
df['director'] = df.groupby('Director')['Rating'].transform('mean')
df['actor_1'] = df.groupby('Actor 1')['Rating'].transform('mean')
df['actor_2'] = df.groupby('Actor 2')['Rating'].transform('mean')
df['actor_3'] = df.groupby('Actor 3')['Rating'].transform('mean')
df['Name'] = df.groupby('Name')['Rating'].transform('mean')

In [37]:
df.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3,genre_mean_rating,director,actor_1,actor_2,actor_3
1,7.0,2019,109.0,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid,6.248697,7.0,6.85,7.0,7.0
3,4.4,2019,110.0,Comedy,4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor,5.838423,4.4,5.25,4.4,4.46
3,4.4,2019,110.0,Romance,4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor,5.838739,4.4,5.25,4.4,4.46
5,4.7,1997,147.0,Comedy,4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor,5.838423,5.335135,4.793617,5.73,5.93
5,4.7,1997,147.0,Drama,4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor,5.875793,5.335135,4.793617,5.73,5.93


In [38]:
rating = df['Rating'].value_counts()
print(rating)

Rating
6.8     437
6.5     416
6.2     404
7.1     337
6.6     336
       ... 
10.0      2
9.7       2
9.2       2
1.4       2
1.1       2
Name: count, Length: 83, dtype: int64


In [None]:
df.describe()

Unnamed: 0,Name,Year,Duration,Rating,Votes,genre_mean_rating,director,actor_1,actor_2,actor_3
count,12008.0,12008.0,12008.0,12008.0,12008.0,12008.0,12008.0,12008.0,12008.0,12008.0
mean,5.880446,1996.000083,135.887658,5.880446,3327.388991,5.880446,5.880446,5.880446,5.880446,5.880446
std,1.341068,19.429043,25.007962,1.373414,15288.308913,0.344685,1.097004,1.001344,1.032083,1.04528
min,1.1,1931.0,21.0,1.1,5.0,4.333333,1.6,1.4,1.6,1.7
25%,5.0,1982.0,120.0,5.0,41.0,5.674915,5.246154,5.3,5.28,5.279167
50%,6.08,2001.0,137.0,6.1,211.0,5.838739,6.0,6.0,5.945455,5.96
75%,6.875,2012.0,152.0,6.9,1457.0,6.181905,6.7,6.575281,6.563636,6.57541
max,10.0,2021.0,321.0,10.0,591417.0,8.0,10.0,10.0,10.0,10.0


In [None]:
df.columns

Index(['Name', 'Year', 'Duration', 'Genre', 'Rating', 'Votes', 'Director',
       'Actor 1', 'Actor 2', 'Actor 3', 'genre_mean_rating', 'director',
       'actor_1', 'actor_2', 'actor_3'],
      dtype='object')

In [None]:
x = df[['Name','Year', 'Duration', 'Votes','genre_mean_rating','director','actor_1','actor_2','actor_3']]
y = df['Rating']

In [None]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [None]:
model = LinearRegression()
model.fit(x_train,y_train)

In [None]:
prediction = model.predict(x_test)
r2_score(y_test,prediction)

0.9602817871360783