<a href="https://colab.research.google.com/github/Vishwa1030/HTML-CSS/blob/main/Task2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Importing Libraries**

In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from google.colab import files


# **Loading the Dataset**

In [None]:
movie= pd.read_csv('/content/task2.csv', encoding='latin1')
print(movie)

                                     Name    Year Duration            Genre  \
0                                             NaN      NaN            Drama   
1      #Gadhvi (He thought he was Gandhi)  (2019)  109 min            Drama   
2                             #Homecoming  (2021)   90 min   Drama, Musical   
3                                 #Yaaram  (2019)  110 min  Comedy, Romance   
4                       ...And Once Again  (2010)  105 min            Drama   
...                                   ...     ...      ...              ...   
15504                 Zulm Ko Jala Doonga  (1988)      NaN           Action   
15505                               Zulmi  (1999)  129 min    Action, Drama   
15506                           Zulmi Raj  (2005)      NaN           Action   
15507                       Zulmi Shikari  (1988)      NaN           Action   
15508                        Zulm-O-Sitam  (1998)  130 min    Action, Drama   

       Rating Votes            Director           A

In [None]:
movie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB


In [None]:
movie.describe()

Unnamed: 0,Rating
count,7919.0
mean,5.841621
std,1.381777
min,1.1
25%,4.9
50%,6.0
75%,6.8
max,10.0


# **Separating Features and Target Variable**

In [None]:
X_train = movie.drop('Rating', axis=1)
y_train = movie['Rating']

### **Defining Preprocessing Steps**

In [None]:
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

numeric_transformer = SimpleImputer(strategy='mean')
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])



### **Imputing Missing Values in the Target Variable**

In [None]:
y_train_imputed = numeric_transformer.fit_transform(y_train.values.reshape(-1, 1))

### **Split the Data into Training and Testing Sets**

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train_imputed, test_size=0.2, random_state=42)

### **Train the Model**

In [None]:
model = LinearRegression()
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
pipeline.fit(X_train, y_train)

### **Make Predictions and Evaluate the Model**

In [None]:
y_pred = pipeline.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

Mean Squared Error: 0.9412720565229539
R^2 Score: 0.022032016035517965


### **Use the Model for Prediction on New Data**

In [None]:
new_data = pd.DataFrame({
    'Genre': ['Action'],
    'Director': ['Christopher Nolan'],
    'Actor 3': ['Christian Bale, Heath Ledger']})

new_data = new_data.reindex(columns=X_train.columns, fill_value=0)


new_data_imputed = pipeline.named_steps['preprocessor'].transform(new_data)


predicted_rating = pipeline.named_steps['model'].predict(new_data_imputed)

print(f'Predicted Rating: {predicted_rating[0]}')


Predicted Rating: [6.32567529]
