# Imports

In [46]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load Dataset

In [73]:
# Read the movie listing CSV (decoded as latin1) into `df`, then print its
# first five rows as a quick sanity check of the columns.
df = pd.read_csv(
    'IMDb-Movies-India.csv',
    encoding='latin1',
)
print(df.head(n=5))

                                 Name    Year Duration            Genre  \
0                                         NaN      NaN            Drama   
1  #Gadhvi (He thought he was Gandhi)  (2019)  109 min            Drama   
2                         #Homecoming  (2021)   90 min   Drama, Musical   
3                             #Yaaram  (2019)  110 min  Comedy, Romance   
4                   ...And Once Again  (2010)  105 min            Drama   

   Rating Votes            Director       Actor 1             Actor 2  \
0     NaN   NaN       J.S. Randhawa      Manmauji              Birbal   
1     7.0     8       Gaurav Bakshi  Rasika Dugal      Vivek Ghamande   
2     NaN   NaN  Soumyajit Majumdar  Sayani Gupta   Plabita Borthakur   
3     4.4    35          Ovais Khan       Prateik          Ishita Raj   
4     NaN   NaN        Amol Palekar  Rajat Kapoor  Rituparna Sengupta   

           Actor 3  
0  Rajendra Bhatia  
1    Arvind Jangid  
2       Roy Angana  
3  Siddhant Kapoor  
4    

# Data Pre-Processing

In [74]:
# Year and Duration arrive as text such as '(2019)' and '109 min'; keep only the
# first run of digits and convert it to float (rows without digits become NaN).
# The patterns are raw strings so that '\d' is not parsed as a Python string
# escape, and expand=False makes extract() return a Series instead of a
# one-column DataFrame.
df['Year'] = df['Year'].str.extract(r'(\d+)', expand=False).astype(float)
df['Duration'] = df['Duration'].str.extract(r'(\d+)', expand=False).astype(float)

# Fill missing ratings with the column median. The result is assigned back
# rather than using fillna(inplace=True) on the column selection, which is a
# chained in-place update that does not modify `df` under pandas Copy-on-Write.
# NOTE(review): 'Rating' is the regression target further down; imputing it
# means the model is fitted and scored on made-up labels. Consider dropping
# those rows instead - confirm with the notebook owner.
df['Rating'] = df['Rating'].fillna(df['Rating'].median())

# Normalise the genre list to 'A,B,C': no whitespace around the commas or at
# either end. Missing genres stay NaN.
df['Genre'] = df['Genre'].str.replace(r'\s*,\s*', ',', regex=True).str.strip()

In [75]:
# Votes is stored as text; drop every character that is not a digit or a dot
# before converting to a number. Anything that still fails to parse becomes NaN
# (errors='coerce'). The pattern is a raw string so '\d' is not treated as a
# Python string escape.
df['Votes'] = pd.to_numeric(
    df['Votes'].str.replace(r'[^\d.]', '', regex=True),
    errors='coerce',
)

# Fill the remaining gaps with the median vote count. Assigned back instead of
# fillna(inplace=True) on the column selection, which does not update `df`
# under pandas Copy-on-Write.
df['Votes'] = df['Votes'].fillna(df['Votes'].median())

In [77]:
# Model inputs are Year, Duration, Votes and Genre; the target is Rating.
X = df.loc[:, ['Year', 'Duration', 'Votes', 'Genre']]
y = df.loc[:, 'Rating']

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [78]:
# Column groups routed to each branch of the ColumnTransformer below.
numeric_features = ['Year', 'Duration', 'Votes']
categorical_features = ['Genre']

# Numeric branch: replace missing values with the column mean.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
])

# Categorical branch: label missing genres 'Unknown', then one-hot encode.
# handle_unknown='ignore' means a category not seen during fit is encoded as
# all zeros instead of raising at predict time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Apply each branch to its own column group.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Model Training

In [79]:
# Full pipeline: column preprocessing followed by an ordinary least-squares
# regressor.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

# Fit on the training split. Left as the last expression so the notebook
# displays the fitted pipeline.
model.fit(X_train, y_train)

# Testing and Evaluation

In [80]:
# Score the fitted pipeline on the held-out split.
predictions = model.predict(X_test)

mse = mean_squared_error(y_true=y_test, y_pred=predictions)
r2 = r2_score(y_true=y_test, y_pred=predictions)

# Report both metrics rounded to two decimals.
print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared: {r2:.2f}")

Mean Squared Error: 0.89
R-squared: 0.09


# Prediction using trained model

In [81]:
def predict_rating(genre, duration, year=2022, votes=None):
    """Predict a rating for a single movie with the fitted `model` pipeline.

    Parameters
    ----------
    genre : str
        Comma-separated genre list, e.g. 'Comedy, Romance'. Whitespace around
        each entry is removed so the value has the same 'A,B' form that the
        preprocessing cell gives the training 'Genre' column. The order of the
        entries is not changed, so 'Romance,Comedy' and 'Comedy,Romance' are
        still different categories. A value that was not seen during fitting is
        encoded as all zeros by the one-hot encoder (handle_unknown='ignore').
    duration : int or float
        Running time in minutes.
    year : int or float, default 2022
        Release year fed to the model.
    votes : int or float, optional
        Vote count fed to the model. When omitted, the median of
        df['Votes'] is used.

    Returns
    -------
    The first (only) element of model.predict() for the one-row input.
    """
    if votes is None:
        votes = df['Votes'].median()

    # Mirror the training-time genre normalisation; without it an input such
    # as 'Comedy, Romance' (space after the comma) can never match a fitted
    # category and the genre is silently ignored.
    if isinstance(genre, str):
        genre = ','.join(part.strip() for part in genre.split(','))

    input_data = pd.DataFrame({
        'Year': [year],
        'Duration': [duration],
        'Votes': [votes],
        'Genre': [genre]
    })

    return model.predict(input_data)[0]

In [82]:
# Ask the user for a genre string and a whole-number duration in minutes.
genre_input = input("Enter the genre: ")
duration_input = int(input("Add Duration in mins.: "))

# Run the prediction helper and report the result to two decimals.
predicted_rating = predict_rating(genre=genre_input, duration=duration_input)
print(
    f"Predicted Rating for Genre '{genre_input}' and Duration {duration_input} minutes: {predicted_rating:.2f}"
)

Enter the genre: Romance, Comedy
Add Duration in mins.: 100
Predicted Rating for Genre 'Romance, Comedy' and Duration 100 minutes: 5.72
