In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
# Location of the raw movies CSV; the file is decoded as latin1.
MOVIES_CSV_PATH = '/home/riddhi/archive/movies.csv'
df = pd.read_csv(MOVIES_CSV_PATH, encoding='latin1')


In [4]:
# Preview the first five rows. A bare last expression lets Jupyter render the
# frame as a table; print() forces plain text that wraps wide frames across
# several column groups.
df.head()

                                 Name    Year Duration            Genre  \
0                                         NaN      NaN            Drama   
1  #Gadhvi (He thought he was Gandhi)  (2019)  109 min            Drama   
2                         #Homecoming  (2021)   90 min   Drama, Musical   
3                             #Yaaram  (2019)  110 min  Comedy, Romance   
4                   ...And Once Again  (2010)  105 min            Drama   

   Rating Votes            Director       Actor 1             Actor 2  \
0     NaN   NaN       J.S. Randhawa      Manmauji              Birbal   
1     7.0     8       Gaurav Bakshi  Rasika Dugal      Vivek Ghamande   
2     NaN   NaN  Soumyajit Majumdar  Sayani Gupta   Plabita Borthakur   
3     4.4    35          Ovais Khan       Prateik          Ishita Raj   
4     NaN   NaN        Amol Palekar  Rajat Kapoor  Rituparna Sengupta   

           Actor 3  
0  Rajendra Bhatia  
1    Arvind Jangid  
2       Roy Angana  
3  Siddhant Kapoor  
4    

In [5]:
# DataFrame.info() writes its report straight to stdout and returns None, so
# wrapping it in print() only appended a stray "None" line after the summary.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB
None


In [6]:
# Summary statistics for the numeric columns, shown through Jupyter's rich
# display (bare last expression) rather than print()'s plain-text rendering.
df.describe()

            Rating
count  7919.000000
mean      5.841621
std       1.381777
min       1.100000
25%       4.900000
50%       6.000000
75%       6.800000
max      10.000000


In [7]:
# Count of missing values per column, displayed as the cell's result instead
# of being routed through print().
df.isnull().sum()

Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64


In [8]:
# Keep only rows with no missing value in any column. The explicit .copy()
# makes the result an independent frame, so the column assignments performed
# in the following cleaning cells cannot raise SettingWithCopyWarning on
# pandas versions that mark the filtered result as a potential copy.
df = df.dropna().copy()

In [9]:
# Normalise 'Year' from text such as "(2019)" into an integer column.
year_text = df['Year'].astype(str)

# Raw-string pattern: '\d' in a normal string literal is an invalid escape
# sequence (a warning today, an error in future Python versions).
# expand=False makes extract() return a Series rather than a one-column
# DataFrame, so the later numeric conversion operates on a plain Series.
year_digits = year_text.str.extract(r'(\d{4})', expand=False)

# Anything that did not contain a 4-digit run becomes NaN here.
year_numeric = pd.to_numeric(year_digits, errors='coerce')

# Replace any NaN with the most frequent year, then store as int.
df['Year'] = year_numeric.fillna(year_numeric.mode()[0]).astype(int)


In [10]:
# Normalise 'Duration' from text such as "109 min" into an integer column.
duration_text = df['Duration'].astype(str)

# Raw-string pattern avoids the invalid '\d' escape; expand=False yields a
# Series instead of a one-column DataFrame.
duration_digits = duration_text.str.extract(r'(\d+)', expand=False)

# Values without any digits become NaN.
duration_numeric = pd.to_numeric(duration_digits, errors='coerce')

# Replace any NaN with the most frequent duration, then store as int.
df['Duration'] = duration_numeric.fillna(duration_numeric.mode()[0]).astype(int)


In [11]:
# Normalise 'Votes' into an integer column.
# regex=False makes the comma removal an explicit literal replacement instead
# of depending on str.replace's version-dependent regex default.
votes_text = df['Votes'].astype(str).str.replace(',', '', regex=False)

# Entries that still are not plain numbers become NaN.
votes_numeric = pd.to_numeric(votes_text, errors='coerce')

# Replace any NaN with the most frequent vote count, then store as int.
df['Votes'] = votes_numeric.fillna(votes_numeric.mode()[0]).astype(int)


In [12]:
# Coerce 'Rating' to a numeric dtype (unparseable entries turn into NaN) and
# back-fill any NaN with the most common rating.
rating_numeric = pd.to_numeric(df['Rating'], errors='coerce')
most_common_rating = rating_numeric.mode()[0]
df['Rating'] = rating_numeric.fillna(most_common_rating)

In [13]:
# Predictor columns and the regression target.
feature_columns = [
    'Genre',
    'Director',
    'Actor 1',
    'Actor 2',
    'Actor 3',
    'Year',
    'Duration',
    'Votes',
]
target_column = 'Rating'

X = df[feature_columns]
y = df[target_column]

In [14]:
# One-hot encode the text columns of X; drop_first=True omits the first level
# of each encoded column.
# NOTE(review): Director/Actor columns presumably hold many distinct names, so
# this may produce a very wide matrix — confirm the resulting X.shape.
X = pd.get_dummies(data=X, drop_first=True)

In [15]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [16]:
# Fit a linear-regression baseline on the training split. The fit call stays
# the last expression so the notebook still echoes the fitted estimator.
model = LinearRegression()
model.fit(X=X_train, y=y_train)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [17]:
# Predict ratings for the held-out rows with the linear model.
y_pred = model.predict(X=X_test)


In [18]:
# Mean squared error of the linear model on the held-out rows.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error:", mse)


Mean Squared Error: 9.289218889039033


In [19]:
# Show the fitted weights (one per column of the encoded feature matrix).
coefficients = model.coef_
print("Model Coefficients:", coefficients)

Model Coefficients: [-5.80225013e-02  9.53345613e-03  4.22584485e-05 ...  0.00000000e+00
  2.56244730e-01  0.00000000e+00]


In [20]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error


In [21]:
# Coefficient of determination of the linear model on the held-out rows.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("R-squared:", r2)

R-squared: -4.0164713179639415


In [22]:
# Mean absolute error of the linear model on the held-out rows.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print("Mean Absolute Error:", mae)

Mean Absolute Error: 2.1641727009038054


In [23]:
# Root mean squared error: square root of the held-out MSE.
squared_error = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(squared_error)
print("Root Mean Squared Error:", rmse)

Root Mean Squared Error: 3.047821991035407


In [29]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the linear model on the full encoded data set.
# The scorer reports negated MSE, so the sign is flipped before printing.
cv_scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring='neg_mean_squared_error',
    cv=5,
)
cv_mse = -cv_scores.mean()
print("Cross-validated MSE:", cv_mse)


Cross-validated MSE: 308142248853611.3


In [31]:
from sklearn.ensemble import RandomForestRegressor

# Fit a 100-tree random forest (seeded for reproducibility) on the same
# training split; fit() returns the estimator itself, so it can be chained.
rf_model = RandomForestRegressor(random_state=42, n_estimators=100).fit(X_train, y_train)

# Score the forest on the held-out rows.
y_pred_rf = rf_model.predict(X_test)
mse_rf = mean_squared_error(y_true=y_test, y_pred=y_pred_rf)
print("Random Forest MSE:", mse_rf)


Random Forest MSE: 1.2264853056537102


In [33]:
# Example single-row input, written directly in encoded (dummy-column) form.
# Replace these placeholder values with real data before relying on the result.
example_features = {
    # encoded categorical indicators (1 = present, 0 = absent)
    'Genre_Action': [1],
    'Genre_Adventure': [0],
    'Director_Spike Lee': [1],
    'Actor 1_Some Actor': [1],
    'Actor 2_Another Actor': [0],
    'Actor 3_Yet Another Actor': [0],
    # numeric features
    'Year': [2021],
    'Duration': [120],
    'Votes': [3000],
}
new_data = pd.DataFrame(example_features)

In [34]:
# Run the example row through get_dummies with the same drop_first setting
# used for the training features. By default get_dummies only encodes
# object/category columns, so an all-integer new_data passes through with its
# columns unchanged.
new_data = pd.get_dummies(data=new_data, drop_first=True)

In [35]:
# Align new_data with the exact column set and order used for training.
# reindex() silently discards columns the model never saw and zero-fills the
# ones that are missing, so report discarded columns first: a misspelled or
# unseen category would otherwise vanish without any trace in the prediction.
unknown_columns = [col for col in new_data.columns if col not in X_train.columns]
if unknown_columns:
    print("Ignoring columns not seen during training:", unknown_columns)

new_data = new_data.reindex(columns=X_train.columns, fill_value=0)

In [36]:
import joblib

# Persist the fitted estimator to disk in the current working directory.
# NOTE(review): this saves `model` (the LinearRegression), not `rf_model`.
joblib.dump(value=model, filename='movie_rating_predictor.pkl')


['movie_rating_predictor.pkl']

In [37]:
# Read the persisted estimator back from disk.
loaded_model = joblib.load(filename='movie_rating_predictor.pkl')


In [38]:
import os

# Show the directory that relative paths (e.g. the .pkl files) resolve against.
current_dir = os.getcwd()
print(current_dir)


/home/riddhi/archive


In [39]:
# Predict a rating for the aligned example row using the reloaded estimator.
new_prediction = loaded_model.predict(X=new_data)

In [40]:
# Display the prediction; predict() returns an array, so the value is printed
# in array form (one entry per input row).
print("Predicted Rating for New Data:", new_prediction)

Predicted Rating for New Data: [4.71084291]


In [41]:
from sklearn.metrics import r2_score

# Recompute R² for the linear model's held-out predictions.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("R² Score:", r2)


R² Score: -4.0164713179639415


In [42]:
!pip install streamlit



In [45]:
import os

# Confirm the working directory before writing the model file.
working_dir = os.getcwd()
print(working_dir)


/home/riddhi/archive


In [46]:
import joblib

# Save the linear-regression estimator under a second file name.
joblib.dump(value=model, filename='linear_regression_model.pkl')


['linear_regression_model.pkl']

In [47]:
# List the working directory to verify the model files were written.
directory_entries = os.listdir()
print(directory_entries)


['movie_rating_model.pkl', 'Task-2.ipynb', 'movie_rating_predictor.pkl', 'linear_regression_model.pkl', '.ipynb_checkpoints', 'movies.csv', 'movie_rating_app.py .ipynb']


In [48]:
# Reload the estimator from the file written above.
loaded_model = joblib.load(filename='linear_regression_model.pkl')


In [49]:
# Round-trip the estimator through an absolute path built from the current
# working directory.
working_directory = os.getcwd()
model_path = os.path.join(working_directory, 'linear_regression_model.pkl')

joblib.dump(value=model, filename=model_path)
loaded_model = joblib.load(filename=model_path)
