# Prep

In [None]:
import pandas as pd

# Load the prepared dataset (path is relative to the notebook's working directory)
file_path = '../preped.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# 'Release Date' is parsed with unit='s' (seconds since the Unix epoch);
# both calendar columns are derived from the same parsed series.
release_timestamps = pd.to_datetime(df['Release Date'], unit='s')
df['Release Year'] = release_timestamps.dt.year
df['Release Month'] = release_timestamps.dt.month
df['Release Month']

In [None]:
# Map a minimum-age value onto one of three audience labels
def age_group(age):
    """Return the age-group label for a minimum-age value.

    - ``'All'``   when ``age`` equals 0
    - ``'Teen'``  when ``age`` lies in the inclusive range 1..16
    - ``'Adult'`` for every other value (anything that matches neither
      test above, e.g. 17 and up, negatives, values strictly between 0 and 1,
      or NaN, whose comparisons are all False)
    """
    if age == 0:
        return 'All'
    return 'Teen' if 1 <= age <= 16 else 'Adult'
# Label every row by passing its 'Minimum Age' through age_group
minimum_ages = df['Minimum Age']
df['Age Group'] = minimum_ages.apply(age_group)

# Display the resulting column
df['Age Group']

# Default

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import classification_report, accuracy_score

# Target label, and the predictors: every int/float column except
# 'Minimum Age' (the label is derived from it) and the label itself.
target_column = 'Age Group'
non_label_columns = df.drop(columns=['Minimum Age', 'Age Group'])
features = non_label_columns.select_dtypes(include=[int, float])

# Feature matrix and target vector
X, y = features, df[target_column]

# Hold out 20% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a Random Forest with default hyperparameters (seeded)
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Predict the held-out rows and report the share predicted correctly
y_pred = rf_model.predict(X_test)
baseline_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", baseline_accuracy)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Class order shared by the matrix rows/columns and the axis tick labels
class_labels = ['Adult', 'All', 'Teen']

# Passing `labels` pins the matrix to this exact order and size. Without it,
# confusion_matrix only includes the classes that actually occur in
# y_test / y_pred, so a class missing from the split would shrink the matrix
# and leave the hard-coded tick labels pointing at the wrong rows/columns.
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Heatmap of the counts: rows are the actual classes, columns the predictions
plt.figure(figsize=(10,8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels,
            yticklabels=class_labels)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Feature Importance

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Importance scores of the default model and the feature positions that sort
# them from highest to lowest
importances = rf_model.feature_importances_
indices = np.argsort(importances)[::-1]

# One bar per feature column, drawn in ranked order
n_features = features.shape[1]
bar_positions = range(n_features)
ranked_names = [features.columns[i] for i in indices]

plt.figure(figsize=(12, 8))
plt.bar(bar_positions, importances[indices], align="center")
plt.title("Feature Importances")
plt.xticks(bar_positions, ranked_names, rotation=90)
plt.xlim([-1, n_features])
plt.show()

# With Important Features Only

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
import numpy as np

# Keep only the features whose importance in the default model exceeds the
# threshold (importance > 0.01), ordered from most to least important.
# The scores are read straight from rf_model rather than from the shared
# `importances` / `indices` variables: the plotting cells reassign those, so
# re-running this cell after them would index `features.columns` with
# positions that belong to a different model.
default_importances = rf_model.feature_importances_
default_order = np.argsort(default_importances)[::-1]
important_features_indices = default_order[default_importances[default_order] > 0.01]
important_features = features.columns[important_features_indices]

# Fail with a clear message instead of fitting a model on zero columns
if len(important_features) == 0:
    raise ValueError("No feature has importance > 0.01; lower the threshold.")

# Subset the features DataFrame to include only important features
X_important = features[important_features]

# Same test_size and random_state as the default model's split
X_train_important, X_test_important, y_train, y_test = train_test_split(X_important, y, test_size=0.2, random_state=42)

# Create and train the Random Forest model with important features
rf_model_important = RandomForestClassifier(random_state=42)
rf_model_important.fit(X_train_important, y_train)

# Make predictions
y_pred_important = rf_model_important.predict(X_test_important)

# Evaluate the model
print("Accuracy (Important Features Only):", accuracy_score(y_test, y_pred_important))
print(classification_report(y_test, y_pred_important))

In [None]:
import numpy as np

import matplotlib.pyplot as plt

# Importance scores of the reduced-feature model and the positions that sort
# them from highest to lowest (this reassigns `importances` and `indices`)
importances = rf_model_important.feature_importances_
indices = np.argsort(importances)[::-1]

# One bar per retained feature, drawn in ranked order
n_important = X_important.shape[1]
bar_positions = range(n_important)
ranked_names = [important_features[i] for i in indices]

plt.figure(figsize=(12, 8))
plt.bar(bar_positions, importances[indices], align="center")
plt.title("Feature Importances (Important Features Only)")
plt.xticks(bar_positions, ranked_names, rotation=90)
plt.xlim([-1, n_important])
plt.show()

# Optimize using HalvingGridSearchCV

In [None]:
# HalvingGridSearchCV is an experimental scikit-learn estimator: this enabling
# import has to run before it can be imported from sklearn.model_selection.
from sklearn.experimental import enable_halving_search_cv  # noqa: F401
from sklearn.model_selection import train_test_split, HalvingGridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
import numpy as np

# Reuses X_train, X_test, y_train, y_test from the earlier split cells.

# Define the parameter grid *without* n_estimators
param_grid = {
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# n_estimators is swept by the outer loop, one halving search per value
n_estimators_range = [50, 100, 200, 300]  # Expanded range

# Sample budget for successive halving; it does not change between searches,
# so it is computed once: start at 10% and cap at 50% of the training rows.
n_train_samples = X_train.shape[0]
min_resources = int(n_train_samples * 0.1)
max_resources = int(n_train_samples * 0.5)

best_cv_score = float('-inf')
best_accuracy = 0
best_model = None
best_n_estimators = None
best_params = None

for n_estimators in n_estimators_range:
    # A separate name keeps the fitted default `rf_model` from earlier cells intact
    candidate_rf = RandomForestClassifier(n_estimators=n_estimators, random_state=42)

    halving_cv = HalvingGridSearchCV(
        estimator=candidate_rf,
        param_grid=param_grid,
        cv=5,
        resource='n_samples',
        max_resources=max_resources,
        factor=2,
        min_resources=min_resources,
        scoring='accuracy',
        n_jobs=-1,
        verbose=0,            # Keep it quiet inside the loop
        random_state=42       # Seed the search so repeated runs agree
    )

    halving_cv.fit(X_train, y_train)

    # Compare searches by their cross-validated score. Picking the winner by
    # test-set accuracy would tune on the test data and make the accuracy
    # reported below optimistically biased.
    if halving_cv.best_score_ > best_cv_score:
        best_cv_score = halving_cv.best_score_
        best_model = halving_cv.best_estimator_
        best_n_estimators = n_estimators
        best_params = halving_cv.best_params_

# The held-out test set is used once, only to evaluate the selected model
y_pred_best = best_model.predict(X_test)
best_accuracy = accuracy_score(y_test, y_pred_best)

print("Best n_estimators:", best_n_estimators)
print("Best hyperparameters (other):", best_params)
print("Best CV accuracy:", best_cv_score)
print("Best Accuracy:", best_accuracy)
print(classification_report(y_test, y_pred_best))

# Important Features

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Importance scores of the tuned model and the positions that sort them from
# highest to lowest (this reassigns `importances` and `indices`)
importances = best_model.feature_importances_
indices = np.argsort(importances)[::-1]

# One bar per feature column, drawn in ranked order
n_features = features.shape[1]
bar_positions = range(n_features)
ranked_names = [features.columns[i] for i in indices]

plt.figure(figsize=(12, 8))
plt.bar(bar_positions, importances[indices], align="center")
plt.title("Feature Importances")
plt.xticks(bar_positions, ranked_names, rotation=90)
plt.xlim([-1, n_features])
plt.show()