# Data Analysis

In [4]:
import pandas as pd

# Location of the raw "IMDb Movies India" CSV on the author's machine.
file_path = r"S:\Codsoft\archive (2)_movie\IMDb Movies India.csv"

# Decode the file as ISO-8859-1 (Latin-1) rather than pandas' default UTF-8
# (presumably the file is not valid UTF-8 -- confirm).
movies_data = pd.read_csv(file_path, encoding="ISO-8859-1")

# Quick sanity check: print the first five rows.
print(movies_data.head(n=5))


                                 Name    Year Duration            Genre  \
0                                         NaN      NaN            Drama   
1  #Gadhvi (He thought he was Gandhi)  (2019)  109 min            Drama   
2                         #Homecoming  (2021)   90 min   Drama, Musical   
3                             #Yaaram  (2019)  110 min  Comedy, Romance   
4                   ...And Once Again  (2010)  105 min            Drama   

   Rating Votes            Director       Actor 1             Actor 2  \
0     NaN   NaN       J.S. Randhawa      Manmauji              Birbal   
1     7.0     8       Gaurav Bakshi  Rasika Dugal      Vivek Ghamande   
2     NaN   NaN  Soumyajit Majumdar  Sayani Gupta   Plabita Borthakur   
3     4.4    35          Ovais Khan       Prateik          Ishita Raj   
4     NaN   NaN        Amol Palekar  Rajat Kapoor  Rituparna Sengupta   

           Actor 3  
0  Rajendra Bhatia  
1    Arvind Jangid  
2       Roy Angana  
3  Siddhant Kapoor  
4    

# Random Forest Regressor

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the dataset; the file is decoded as ISO-8859-1 (Latin-1) instead of
# pandas' default UTF-8.
file_path = r"S:\Codsoft\archive (2)_movie\IMDb Movies India.csv"
movies_data = pd.read_csv(file_path, encoding='ISO-8859-1')

# Display the first few rows of the dataset
print(movies_data.head())

# Column roles used throughout the pipeline.
target_column = 'Rating'
feature_columns = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']

# Dropping columns that won't be used for prediction
movies_data_cleaned = movies_data.drop(['Name', 'Year', 'Duration', 'Votes'], axis=1)

# Rows without a rating have no ground truth to learn from or to score against.
# Imputing the target (e.g. with the mode) would fabricate labels and distort
# both training and the reported metrics, so those rows are dropped instead.
movies_data_cleaned = movies_data_cleaned.dropna(subset=[target_column]).copy()

# Handling missing values in the FEATURES only: fill each column with its most
# frequent value (first row of DataFrame.mode()).
feature_modes = movies_data_cleaned[feature_columns].mode().iloc[0]
movies_data_cleaned[feature_columns] = movies_data_cleaned[feature_columns].fillna(feature_modes)

# Every remaining row must now be fully populated.
assert not movies_data_cleaned[feature_columns + [target_column]].isna().any().any()

# Encode categorical variables as integer codes. A separate encoder is fitted
# per column and kept in `label_encoders` so codes can be mapped back later.
# After the loop `label_encoder` refers to the 'Actor 3' encoder, matching the
# state the previous version of this cell left behind.
label_encoders = {}
for column in feature_columns:
    label_encoder = LabelEncoder()
    movies_data_cleaned[column] = label_encoder.fit_transform(movies_data_cleaned[column])
    label_encoders[column] = label_encoder

# Splitting the data into features and target variable
X = movies_data_cleaned[feature_columns]
y = movies_data_cleaned[target_column]

# Splitting the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardizing the features; the scaler is fitted on the training split only
# and then applied to the test split.
# NOTE(review): tree ensembles split on thresholds, so this step is not
# expected to change the fit; it is kept to preserve the existing pipeline.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initializing the Random Forest regressor (seeded for reproducibility)
rf_regressor = RandomForestRegressor(random_state=42)

# Training the model
rf_regressor.fit(X_train, y_train)

# Making predictions on the testing set
y_pred = rf_regressor.predict(X_test)

# Evaluating the model on held-out rows that all have a real rating
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Displaying the evaluation metrics
print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'R²: {r2}')


                                 Name    Year Duration            Genre  \
0                                         NaN      NaN            Drama   
1  #Gadhvi (He thought he was Gandhi)  (2019)  109 min            Drama   
2                         #Homecoming  (2021)   90 min   Drama, Musical   
3                             #Yaaram  (2019)  110 min  Comedy, Romance   
4                   ...And Once Again  (2010)  105 min            Drama   

   Rating Votes            Director       Actor 1             Actor 2  \
0     NaN   NaN       J.S. Randhawa      Manmauji              Birbal   
1     7.0     8       Gaurav Bakshi  Rasika Dugal      Vivek Ghamande   
2     NaN   NaN  Soumyajit Majumdar  Sayani Gupta   Plabita Borthakur   
3     4.4    35          Ovais Khan       Prateik          Ishita Raj   
4     NaN   NaN        Amol Palekar  Rajat Kapoor  Rituparna Sengupta   

           Actor 3  
0  Rajendra Bhatia  
1    Arvind Jangid  
2       Roy Angana  
3  Siddhant Kapoor  
4    