<a href="https://colab.research.google.com/github/Sara102006/CODSOFT/blob/main/Codesoft2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
# Movie Rating Prediction using Regression

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# 1. Load dataset
data = pd.read_csv("Movies.csv", encoding="ISO-8859-1")

# 2. Normalize column names
data.columns = data.columns.str.strip().str.lower()

print("Columns:", data.columns)

# 3. Drop rows with missing required values
data = data.dropna(subset=['genre', 'director', 'actor 1', 'rating'])

# 4. Features and target
X = data[['genre', 'director', 'actor 1']]
y = data['rating']

# 5. One-Hot Encoding
categorical_features = ['genre', 'director', 'actor 1']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

# 6. Build pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# 7. Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 8. Train model
model.fit(X_train, y_train)

# 9. Predict
y_pred = model.predict(X_test)

# 10. Evaluate
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("R2 Score:", r2_score(y_test, y_pred))

# 11. Predict rating for a new movie
new_movie = pd.DataFrame({
    'genre': ['Action'],
    'director': ['Christopher Nolan'],
    'actor 1': ['Leonardo DiCaprio']
})

predicted_rating = model.predict(new_movie)
print("Predicted Rating:", predicted_rating[0])

Columns: Index(['name', 'year', 'duration', 'genre', 'rating', 'votes', 'director',
       'actor 1', 'actor 2', 'actor 3'],
      dtype='object')
Mean Squared Error: 2.292893597497809
R2 Score: -0.1881077082661322
Predicted Rating: 5.388914208161257
