In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the historic records used by every model below.
# NOTE(review): absolute path (looks like a Colab working directory) - adjust when running elsewhere.
historic_df = pd.read_csv(
    "/content/historic.csv",
)

In [None]:
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [None]:

# Baseline: random forest on 'stars' plus three categorical columns.
categorical_features = ['category', 'main_promotion', 'color']
numeric_features = ['stars']

# Hold out 20% of the rows for evaluation (fixed seed so the split is repeatable).
y = historic_df['success_indicator']
X = historic_df.drop(columns='success_indicator')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Numeric columns: fill gaps with the median, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; unknown categories are ignored instead of raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
])

# Train, then score on the held-out rows.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))

Accuracy: 0.81875
              precision    recall  f1-score   support

        flop       0.78      0.69      0.73       571
         top       0.84      0.89      0.86      1029

    accuracy                           0.82      1600
   macro avg       0.81      0.79      0.80      1600
weighted avg       0.82      0.82      0.82      1600



In [None]:
from sklearn.ensemble import IsolationForest


### Repeating the workflow after removing the main_promotion column

In [None]:
# Categorical predictors used in this section ('main_promotion' is left out).
categorical_cols = [
    'category',
    'color',
]

In [None]:
# Working frame: the two categorical predictors, 'stars', and the target column.
df = historic_df[[
    'category',
    'color',
    'stars',
    'success_indicator',
]]

In [None]:
# Per-column-type preprocessing for the reduced feature set.
# Neither pipeline here contains an imputation step.
numerical_transformer = Pipeline([('scaler', StandardScaler())])

categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [None]:
# Route each column group to its transformer: categorical columns are
# one-hot encoded, 'stars' is standardised.
preprocessor = ColumnTransformer([
    ('cat', categorical_transformer, categorical_cols),
    ('num', numerical_transformer, ['stars']),
])

In [None]:
# Random forest for the reduced feature set.
# Seeded (same seed as the earlier baseline) so that re-running the notebook
# reproduces the reported accuracy instead of a slightly different one each time.
clf = RandomForestClassifier(random_state=42)

# Preprocessing and classifier chained so both are fitted on training data only.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', clf)])

In [None]:
# Separate the target column from the predictor columns.
y = df['success_indicator']
X = df.drop(columns='success_indicator')

In [None]:
# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit preprocessing and classifier together on the training split.
pipeline.fit(X=X_train, y=y_train)

In [None]:
# Score the fitted pipeline on the held-out rows.
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.819375


In [None]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
# Candidate models, keyed by the label printed in the evaluation loop.
# The two boosting ensembles are seeded so that re-running the notebook
# reproduces the reported scores; the other estimators are left at their defaults.
classifiers = {
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'Logistic Regression': LogisticRegression()
}


In [None]:
# Build one pipeline per candidate classifier, keyed by classifier label.
# Pipeline fits the step objects it is given rather than copies of them, so each
# pipeline receives its own unfitted clone of the preprocessor; otherwise
# fitting one pipeline would refit the transformer shared by all the others.
pipelines = {}
for clf_name, clf in classifiers.items():
    pipeline = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                               ('classifier', clf)])
    pipelines[clf_name] = pipeline


In [None]:
# Fit every candidate on the training split and print its held-out metrics.
# NOTE: the loop rebinds `pipeline`, `y_pred` and `accuracy`; after this cell
# they refer to the last classifier evaluated.
for clf_name in pipelines:
    pipeline = pipelines[clf_name]
    print(f"Training and evaluating {clf_name}...")
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {accuracy}')
    print(classification_report(y_test, y_pred))
    print("----------------------------------------")


Training and evaluating SVM...
Accuracy: 0.840625
              precision    recall  f1-score   support

        flop       0.82      0.71      0.76       571
         top       0.85      0.91      0.88      1029

    accuracy                           0.84      1600
   macro avg       0.83      0.81      0.82      1600
weighted avg       0.84      0.84      0.84      1600

----------------------------------------
Training and evaluating KNN...
Accuracy: 0.82375
              precision    recall  f1-score   support

        flop       0.79      0.68      0.73       571
         top       0.84      0.90      0.87      1029

    accuracy                           0.82      1600
   macro avg       0.82      0.79      0.80      1600
weighted avg       0.82      0.82      0.82      1600

----------------------------------------
Training and evaluating Gradient Boosting...
Accuracy: 0.81875
              precision    recall  f1-score   support

        flop       0.79      0.67      0.73    

1. SVM (Support Vector Machine):

SVM obtained an 84.06% accuracy rate.
Precision was 0.82 for "flop" and 0.85 for "top" (recall 0.71 and 0.91, F1-score 0.76 and 0.88), so its precision was balanced across the two classes while recall was weaker for "flop".

2. KNN (K-Nearest Neighbors):

KNN obtained an 82.38% accuracy rate.
In comparison to SVM, it demonstrated marginally inferior recall, precision, and F1-scores, especially for the 'flop' class.

3. Gradient Boosting:

With gradient boosting, 81.88% accuracy was attained.
It performed worse than SVM and KNN on the 'flop' class, even though it had strong recall for the 'top' class.

4. AdaBoost:

AdaBoost's accuracy rate was 81.06%.
While slightly worse than SVM and KNN, it demonstrated balanced performance with comparable precision, recall, and F1-scores for both classes.

5. Logistic Regression:
The accuracy of Logistic Regression was 81.63%.
It showed balanced performance, similar to AdaBoost, but with significantly lower recall and precision for the 'flop' class.

With the greatest accuracy and balanced performance across precision, recall, and F1-scores for both classes, SVM outperformed the other classifiers in the evaluation. KNN performed competitively as well, albeit slightly worse than SVM. In comparison to SVM and KNN, Gradient Boosting, AdaBoost, and Logistic Regression showed somewhat lower accuracy, with differing degrees of imbalance between the two classes in F1-score, precision, and recall.