In [3]:
#libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [4]:
# Load the dataset
historic_data = pd.read_csv('historic.csv')

# Converting top to 0 and flop to 1
historic_data['success_indicator'] = historic_data['success_indicator'].map({'top': 0, 'flop': 1})

In [5]:
# Defining feature columns
X = historic_data.drop(columns=['success_indicator'])
y = historic_data['success_indicator']

# Spliting for train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:
#preprocessing steps
numeric_features = ['stars']
categorical_features = ['category', 'main_promotion', 'color']

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [7]:
# Define parameter grids for RandomForestClassifier and MLPClassifier
param_grids = [
    {'classifier': [RandomForestClassifier()],
     'classifier__n_estimators': [100, 200, 300]},
    {'classifier': [MLPClassifier(max_iter=1000)],
     'classifier__hidden_layer_sizes': [(25,), (15, 15), (5, 5, 5)],
     'classifier__activation': ['relu', 'tanh'],
     'classifier__solver': ['adam']}
]

#best model and best score
best_model = None
best_score = 0


In [8]:
# Initialize best model and best score
best_model = None
best_score = 0

for params in param_grids:
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', None)])

    grid_search = GridSearchCV(pipeline, param_grid=params, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    # Prediction on test set
    y_pred = grid_search.predict(X_test)

    # Calculating evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Print results
    print("Best Parameters:", grid_search.best_params_)
    print("Best Score:", grid_search.best_score_)
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)
    print("-----*****-----*****-----*****-----*****-----*****-----*****-----")

    # Update best model and best score if current model is better
    if grid_search.best_score_ > best_score:
        best_model = grid_search.best_estimator_
        best_score = grid_search.best_score_

# Printing best model and best score
print("Best Model:", best_model)
print("Best Score:", best_score)


Best Parameters: {'classifier': RandomForestClassifier(n_estimators=300), 'classifier__n_estimators': 300}
Best Score: 0.8159375000000001
Accuracy: 0.821875
Precision: 0.782608695652174
Recall: 0.6935201401050788
F1 Score: 0.735376044568245
-----*****-----*****-----*****-----*****-----*****-----*****-----
Best Parameters: {'classifier': MLPClassifier(activation='tanh', hidden_layer_sizes=(25,), max_iter=1000), 'classifier__activation': 'tanh', 'classifier__hidden_layer_sizes': (25,), 'classifier__solver': 'adam'}
Best Score: 0.8493749999999999
Accuracy: 0.851875
Precision: 0.8353413654618473
Recall: 0.7285464098073555
F1 Score: 0.7782974742750233
-----*****-----*****-----*****-----*****-----*****-----*****-----
Best Model: Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
            



# Random Forest is chosen because of ...

1. **High Accuracy**: Random Forest often yields high accuracy compared to other models, making it a reliable choice for various datasets and scenarios.

2. **Ensemble Learning**: Random Forest is an ensemble learning method that combines multiple decision trees to make predictions. This approach helps reduce overfitting and improves generalization performance.

3. **Robustness to Outliers**: Random Forest is less sensitive to outliers compared to distance-based algorithms like k-Nearest Neighbors or linear models. This robustness makes it suitable for datasets with noisy or skewed data points.

4. **Interpretability**: Despite being an ensemble model, Random Forest remains relatively interpretable. It provides feature importances, which can help understand the relative importance of different features in making predictions.
