# Prep

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset into a DataFrame. The path is relative, so it resolves
# against the directory the kernel was started in.
file_path = '../../preped.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# 'Release Date' is parsed as a count of seconds (unit='s'); convert it once
# and derive both calendar columns from the same parsed series.
release_dates = pd.to_datetime(df['Release Date'], unit='s')
df['Release Year'] = release_dates.dt.year
df['Release Month'] = release_dates.dt.month

# Show the month column as the cell output.
df['Release Month']

In [None]:
# Define age groups
def age_group(age):
    """Map a minimum-age value to a coarse audience label.

    Returns:
        'All'   when ``age`` equals 0,
        'Teen'  when ``age`` lies in the inclusive range 1..16,
        'Adult' for every other value (anything that matches neither test
        above, e.g. 17+, negatives, values strictly between 0 and 1, NaN).
    """
    if age == 0:
        return 'All'
    if 1 <= age <= 16:
        return 'Teen'
    return 'Adult'
# Label every row with its audience bucket, computed element-wise from
# 'Minimum Age'.
df['Age Group'] = df['Minimum Age'].map(age_group)

# Show the new column as the cell output.
df['Age Group']

# Default

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Target: the audience bucket. Predictors: every int/float column that is
# left after removing 'Minimum Age' (the bucket is computed from it) and the
# label itself. The drop must come first: it names 'Age Group' explicitly.
target = df['Age Group']
candidate_columns = df.drop(columns=['Minimum Age', 'Age Group'])
features = candidate_columns.select_dtypes(include=[int, float])

# Hold out 20% of the rows for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Fit an untuned decision tree (fit() returns the estimator itself).
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Score the tree on the held-out rows.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.3f}')

# Scaled

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Refit a tree on the standardized features. NOTE: this rebinds `clf` and
# `y_pred` (read by the confusion-matrix and feature-importance cells below)
# as well as `accuracy`.
clf = DecisionTreeClassifier(random_state=42).fit(X_train_scaled, y_train)

y_pred = clf.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.3f}')

# Visualize

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Pin the class order explicitly. Without `labels=`, confusion_matrix derives
# the order from the values present in y_test / y_pred, so the hard-coded tick
# labels would be misaligned whenever one class is absent from both.
class_labels = ['Adult', 'All', 'Teen']

# Rows are the actual classes, columns the predicted ones. `y_pred` is
# whatever the most recently executed model cell produced (the scaled tree
# when the notebook is run top to bottom).
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Draw the counts as an annotated heatmap.
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels,
            yticklabels=class_labels)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Feature Importances

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Importance scores of the current `clf`, plus the index order that ranks
# them from highest to lowest.
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]
n_features = features.shape[1]
ranked_names = [features.columns[i] for i in indices]

# One bar per feature, most important first, labelled with the column name.
fig, ax = plt.subplots(figsize=(12, 8))
ax.set_title("Feature Importances")
ax.bar(range(n_features), importances[indices], align="center")
ax.set_xticks(range(n_features))
ax.set_xticklabels(ranked_names, rotation=90)
ax.set_xlim([-1, n_features])
plt.show()

# Without unimportant Features

In [None]:
# Minimum importance a feature needs in order to be kept.
threshold = 0.01

# Read the scores from the fitted full-feature tree itself rather than from
# the `importances` variable, so this cell does not depend on which plotting
# cell last assigned that name (a mask of the wrong length would otherwise
# break the column selection on a re-run).
full_importances = clf.feature_importances_
important_features = features.columns[full_importances > threshold]

# Feature frame restricted to the important columns (also used for plotting).
features_important = features[important_features]

# Reuse the existing train/test split instead of calling train_test_split
# again: the rows are the same ones the seeded split produced, and
# y_train / y_test are left untouched.
X_train_important = X_train[important_features]
X_test_important = X_test[important_features]

# Fit a fresh tree on the reduced feature set.
clf_important = DecisionTreeClassifier(random_state=42)
clf_important.fit(X_train_important, y_train)

# Evaluate on the held-out rows.
y_pred_important = clf_important.predict(X_test_important)
accuracy_important = accuracy_score(y_test, y_pred_important)
print(f'Accuracy with important features: {accuracy_important:.3f}')

# Important Features

In [None]:
import matplotlib.pyplot as plt

# Use dedicated names for the reduced model's scores. Rebinding the shared
# `importances` / `indices` variables here would leave `importances` with one
# entry per *kept* feature, which no longer lines up with `features.columns`
# for any cell that pairs the two.
importances_important = clf_important.feature_importances_
indices_important = np.argsort(importances_important)[::-1]
n_important = features_important.shape[1]

# Plot the reduced model's importances, most important feature first.
plt.figure(figsize=(12, 8))
plt.title("Feature Importances")
plt.bar(range(n_important), importances_important[indices_important], align="center")
plt.xticks(
    range(n_important),
    [features_important.columns[i] for i in indices_important],
    rotation=90,
)
plt.xlim([-1, n_important])
plt.show()

# Optimize using HalvingGridSearchCV

In [None]:
# Importing enable_halving_search_cv is required for its side effect: it
# makes HalvingGridSearchCV importable from sklearn.model_selection.
from sklearn.experimental import enable_halving_search_cv  # noqa: F401
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.ensemble import RandomForestClassifier
# classification_report is called below; import it here so this cell does not
# rely on another cell for it.
from sklearn.metrics import classification_report

param_grid = {
    'n_estimators': [50, 100, 200],  # Explore different numbers of trees
    'max_depth': [None, 10, 20],      # Explore different tree depths
    'min_samples_split': [2, 5, 10],   # Explore different minimum samples to split
    'min_samples_leaf': [1, 2, 4],     # Explore different minimum samples per leaf
    'criterion': ['gini', 'entropy']  # Explore different splitting criteria
}

# Initialize HalvingGridSearchCV
halving_cv = HalvingGridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,                 # 5-fold cross-validation
    scoring='accuracy',   # Optimize for accuracy
    n_jobs=-1,            # Use all available cores
    verbose=0,            # Don't print progress updates
    factor=2,             # Reduce the number of candidates by half in each iteration
    resource='n_samples', # Using n_samples as the resource parameter
    random_state=42       # Seed the search as well as the forest so reruns are comparable
)

# Fit the search on the full-feature training split
halving_cv.fit(X_train, y_train)

# Best model found by the search (refit by the search object)
best_rf_model = halving_cv.best_estimator_

# Make predictions on the test set. NOTE: this rebinds `y_pred`.
y_pred = best_rf_model.predict(X_test)

# Evaluate the model
print("Best Hyperparameters:", halving_cv.best_params_)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))


# Feature importance table, most important first. The column names come from
# X_train, the frame the forest was fitted on (the original referenced an
# undefined name `X`).
feature_importances = best_rf_model.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

In [None]:
# Bar chart of the tuned forest's importances, in the row order of
# feature_importance_df.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(feature_importance_df['Feature'], feature_importance_df['Importance'])
ax.tick_params(axis='x', labelrotation=90)
ax.set_xlabel("Features")
ax.set_ylabel("Importance")
ax.set_title("Feature Importances")
fig.tight_layout()
plt.show()

# Optimize without unimportant Features

In [None]:
# classification_report is called below; import it here so this cell does not
# rely on another cell for it.
from sklearn.metrics import classification_report

low_importance_threshold = 0.01  # Adjust the threshold as needed
low_importance_mask = feature_importance_df['Importance'] < low_importance_threshold
features_to_remove = feature_importance_df.loc[low_importance_mask, 'Feature'].tolist()

print(f"Features to remove: {features_to_remove}")

if features_to_remove:
    # Drop the low-importance columns from both splits and rerun the search.
    X_train_reduced = X_train.drop(columns=features_to_remove)
    X_test_reduced = X_test.drop(columns=features_to_remove)

    halving_cv_reduced = HalvingGridSearchCV(
        RandomForestClassifier(random_state=42),
        param_grid,
        cv=5,                 # 5-fold cross-validation
        scoring='accuracy',   # Optimize for accuracy
        n_jobs=-1,            # Use all available cores
        verbose=0,            # Don't print progress updates
        factor=2,             # Reduce the number of candidates by half in each iteration
        resource='n_samples', # Using n_samples as the resource parameter
        random_state=42       # Seed the search as well as the forest so reruns are comparable
    )

    halving_cv_reduced.fit(X_train_reduced, y_train)
    best_rf_model_reduced = halving_cv_reduced.best_estimator_
    y_pred_reduced = best_rf_model_reduced.predict(X_test_reduced)

    print("Reduced Model Best Hyperparameters:", halving_cv_reduced.best_params_)
    print("Reduced Model Classification Report:\n", classification_report(y_test, y_pred_reduced))
    print("Reduced Model Accuracy:", accuracy_score(y_test, y_pred_reduced))
else:
    # Nothing fell below the threshold, so the "reduced" model is simply the
    # full one. Define the same names anyway so later cells that read
    # y_pred_reduced / best_rf_model_reduced do not fail with a NameError.
    X_train_reduced, X_test_reduced = X_train, X_test
    halving_cv_reduced = halving_cv
    best_rf_model_reduced = best_rf_model
    y_pred_reduced = best_rf_model_reduced.predict(X_test_reduced)
    print("No features below the threshold; reusing the full model.")


# Visualize Best Model

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

import matplotlib.pyplot as plt

# Pin the class order explicitly. Without `labels=`, confusion_matrix derives
# the order from the values present in y_test / y_pred, so the hard-coded tick
# labels would be misaligned whenever one class is absent from both.
class_labels = ['Adult', 'All', 'Teen']

# Rows are the actual classes, columns the predicted ones. `y_pred` holds the
# tuned forest's predictions when the notebook is run top to bottom.
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Draw the counts as an annotated heatmap.
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels,
            yticklabels=class_labels)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Visualize Best Model Without Unimportant Features

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

import matplotlib.pyplot as plt


# Pin the class order explicitly. Without `labels=`, confusion_matrix derives
# the order from the values present in its inputs, so the hard-coded tick
# labels would be misaligned whenever one class is absent from both.
class_labels = ['Adult', 'All', 'Teen']

# Confusion matrix for the reduced-feature forest's predictions
# (rows: actual classes, columns: predicted classes).
cm = confusion_matrix(y_test, y_pred_reduced, labels=class_labels)

# Draw the counts as an annotated heatmap.
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels,
            yticklabels=class_labels)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

Best Hyperparameters: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}
Classification Report:
               precision    recall  f1-score   support

       Adult       0.67      0.72      0.69       216
         All       0.67      0.48      0.56        58
        Teen       0.72      0.72      0.72       228

    accuracy                           0.69       502
   macro avg       0.68      0.64      0.66       502
weighted avg       0.69      0.69      0.69       502

Accuracy: 0.6912350597609562

Features to remove: ['Sci-Fi', 'Runtime', 'Sport', 'Biography', 'Mystery', 'Documentary', 'Musical', 'Music', 'History', 'War', 'Western', 'Is Series', 'News']

Reduced Model Best Hyperparameters: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Reduced Model Classification Report:
               precision    recall  f1-score   support

       Adult       0.66      0.70      0.68       216
         All       0.67      0.45      0.54        58
        Teen       0.71      0.73      0.72       228

    accuracy                           0.68       502
   macro avg       0.68      0.63      0.64       502
weighted avg       0.68      0.68      0.68       502

Reduced Model Accuracy: 0.6832669322709163