In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset
file_path = '/content/movies.csv'
movies_df = pd.read_csv(file_path, encoding='ISO-8859-1')

In [3]:
# Display the first few rows to understand the structure
print(movies_df.head())

                                 Name    Year Duration            Genre  \
0                                         NaN      NaN            Drama   
1  #Gadhvi (He thought he was Gandhi)  (2019)  109 min            Drama   
2                         #Homecoming  (2021)   90 min   Drama, Musical   
3                             #Yaaram  (2019)  110 min  Comedy, Romance   
4                   ...And Once Again  (2010)  105 min            Drama   

   Rating Votes            Director       Actor 1             Actor 2  \
0     NaN   NaN       J.S. Randhawa      Manmauji              Birbal   
1     7.0     8       Gaurav Bakshi  Rasika Dugal      Vivek Ghamande   
2     NaN   NaN  Soumyajit Majumdar  Sayani Gupta   Plabita Borthakur   
3     4.4    35          Ovais Khan       Prateik          Ishita Raj   
4     NaN   NaN        Amol Palekar  Rajat Kapoor  Rituparna Sengupta   

           Actor 3  
0  Rajendra Bhatia  
1    Arvind Jangid  
2       Roy Angana  
3  Siddhant Kapoor  
4    

In [4]:
# Remove rows with missing target values (Rating)
movies_df_cleaned = movies_df.dropna(subset=['Rating'])

In [6]:
# Clean the 'Year' column by removing parentheses and converting to numeric
movies_df_cleaned['Year'] = movies_df_cleaned['Year'].str.extract('(\d{4})').astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df_cleaned['Year'] = movies_df_cleaned['Year'].str.extract('(\d{4})').astype(float)


In [8]:
# Clean the 'Votes' column by removing commas and converting to numeric
movies_df_cleaned['Votes'] = movies_df_cleaned['Votes'].str.replace(',', '').astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df_cleaned['Votes'] = movies_df_cleaned['Votes'].str.replace(',', '').astype(float)


In [14]:
# Impute missing values for numerical columns
movies_df_cleaned['Year'].fillna(movies_df_cleaned['Year'].median(), inplace=True)
movies_df_cleaned['Votes'].fillna(movies_df_cleaned['Votes'].median(), inplace=True)
movies_df_cleaned['Duration'] = movies_df_cleaned['Duration'].str.replace(' min', '').astype(float)
movies_df_cleaned['Duration'].fillna(movies_df_cleaned['Duration'].median(), inplace=True)

# Impute missing values for categorical columns
movies_df_cleaned['Genre'].fillna(movies_df_cleaned['Genre'].mode()[0], inplace=True)
movies_df_cleaned['Director'].fillna(movies_df_cleaned['Director'].mode()[0], inplace=True)
movies_df_cleaned['Actor 1'].fillna(movies_df_cleaned['Actor 1'].mode()[0], inplace=True)
movies_df_cleaned['Actor 2'].fillna(movies_df_cleaned['Actor 2'].mode()[0], inplace=True)
movies_df_cleaned['Actor 3'].fillna(movies_df_cleaned['Actor 3'].mode()[0], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df_cleaned['Year'].fillna(movies_df_cleaned['Year'].median(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df_cleaned['Votes'].fillna(movies_df_cleaned['Votes'].median(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df_cleaned['Duration'] = movies_df_cleaned['Duration'].str.replace(' min', '').astype(float)
A value is trying to be set on a copy of a slice fr

In [15]:
# One-hot encode categorical features
movies_df_encoded = pd.get_dummies(movies_df_cleaned, columns=['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3'])

# Ensure there are no remaining NaNs
movies_df_encoded.fillna(0, inplace=True)

In [16]:
# Define features and target variable
X = movies_df_encoded.drop(columns=['Name', 'Rating'])
y = movies_df_encoded['Rating']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Squared Error: 8529802014834.831
R-squared: -4588023725724.286
