In [17]:
                                                     IMDb Movie Rating - Task 2
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# Read the IMDb India movies CSV into a DataFrame (the file is decoded as latin1)
file_path = 'IMDb Movies India.csv'
data = pd.read_csv(filepath_or_buffer=file_path, encoding='latin1')

# Preview the first five rows as a quick sanity check of the load
print(data.head(n=5))

# Preprocess the Data
# Columns used as model inputs; each one holds free-text categorical values.
feature_columns = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']

# Narrow to the modelled columns BEFORE dropping missing values, so a row is
# discarded only when a feature or the rating itself is missing -- not because
# an unused column such as Year or Duration happens to be empty.
# The explicit copy makes `data` an independent frame, so the column
# assignments below do not trigger pandas' SettingWithCopyWarning.
data = data[feature_columns + ['Rating']].dropna().copy()

# Convert the rating to categories: [0, 5] 'Low', (5, 7] 'Medium', (7, 10] 'High'.
# include_lowest=True closes the first bin on the left, so a rating of exactly
# 0 is labelled 'Low' instead of silently becoming NaN.
bins = [0, 5, 7, 10]
labels = ['Low', 'Medium', 'High']
data['Rating_category'] = pd.cut(
    data['Rating'], bins=bins, labels=labels, include_lowest=True
)

# Encode each categorical feature as integer ids. The fitted encoder is kept
# per column so the ids can be mapped back to the original strings later.
label_encoders = {}
for column in feature_columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Encode the target variable; the fitted encoder is kept so class names can be
# recovered from the integer ids when reporting results.
label_encoder_y = LabelEncoder()
data['Rating_category'] = label_encoder_y.fit_transform(data['Rating_category'])

# Separate features and target variable
X = data[feature_columns]
y = data['Rating_category']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the Decision Tree Classifier (seeded so repeated runs build the same tree)
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Evaluate the model on the held-out rows
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Integer ids produced by the target encoder, one per entry of classes_.
# Passing them explicitly keeps the report rows and matrix axes aligned with
# the class names even if some class is absent from the test split and the
# predictions; without `labels`, classification_report raises a ValueError
# when the observed classes and target_names differ in length.
class_ids = list(range(len(label_encoder_y.classes_)))

# Print classification report
print(classification_report(
    y_test,
    y_pred,
    labels=class_ids,
    target_names=label_encoder_y.classes_,
))

# Print confusion matrix (rows: true class, columns: predicted class, in class_ids order)
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
print('IMDb Movies India')
print(cm)

                                 Name    Year Duration            Genre  \
0                                         NaN      NaN            Drama   
1  #Gadhvi (He thought he was Gandhi)  (2019)  109 min            Drama   
2                         #Homecoming  (2021)   90 min   Drama, Musical   
3                             #Yaaram  (2019)  110 min  Comedy, Romance   
4                   ...And Once Again  (2010)  105 min            Drama   

   Rating Votes            Director       Actor 1             Actor 2  \
0     NaN   NaN       J.S. Randhawa      Manmauji              Birbal   
1     7.0     8       Gaurav Bakshi  Rasika Dugal      Vivek Ghamande   
2     NaN   NaN  Soumyajit Majumdar  Sayani Gupta   Plabita Borthakur   
3     4.4    35          Ovais Khan       Prateik          Ishita Raj   
4     NaN   NaN        Amol Palekar  Rajat Kapoor  Rituparna Sengupta   

           Actor 3  
0  Rajendra Bhatia  
1    Arvind Jangid  
2       Roy Angana  
3  Siddhant Kapoor  
4    