In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import warnings
# Ignore all warnings
warnings.filterwarnings("ignore")


In [2]:
# uncomment this lines to use the code in google collab
# from google.colab import drive
# drive.mount('/content/drive/')


Mounted at /content/drive/


In [3]:
# Load the Kickstarter dataset
#df = pd.read_csv('/content/drive/MyDrive/257 Project/kickstarter_data_full.csv'). way to access data from google collab

filename = 'kickstarter_data_full.csv'
df = pd.read_csv(filename)

df.drop(columns=['Unnamed: 0', 'id','name', 'blurb','photo', 'slug', 'disable_communication', 
                 'currency_symbol', 'currency_trailing_code', 'creator', 
                 'location', 'category', 'profile', 'urls', 'source_url', 
                 'friends', 'is_starred', 'is_backing', 'permissions'], 
        inplace=True)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20632 entries, 0 to 20631
Data columns (total 49 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   goal                         20632 non-null  float64
 1   pledged                      20632 non-null  float64
 2   state                        20632 non-null  object 
 3   country                      20632 non-null  object 
 4   currency                     20632 non-null  object 
 5   deadline                     20632 non-null  object 
 6   state_changed_at             20632 non-null  object 
 7   created_at                   20632 non-null  object 
 8   launched_at                  20632 non-null  object 
 9   staff_pick                   20632 non-null  bool   
 10  backers_count                20632 non-null  int64  
 11  static_usd_rate              20632 non-null  float64
 12  usd_pledged                  20632 non-null  float64
 13  spotlight       

In [5]:
# find value counts for all columns in the DataFrame
# for col in df.columns:
#     print(f"{col}:")
#     print(df[col].value_counts())

## Updating the Missing Value

In [6]:
for column in df.columns:
    if df[column].dtype == 'object':
        df[column].fillna(df[column].mode()[0], inplace=True)
    elif df[column].dtype.name == 'category':
        df[column].fillna(df[column].mode()[0], inplace=True)
    else:
        df[column].fillna(df[column].mean(), inplace=True)

## Converting all Data Types to Integer

In [7]:

df['staff_pick'] = df['staff_pick'].astype(int)
df['spotlight'] = df['spotlight'].astype(int)

In [8]:
# Convert categorical features to numerical using one-hot encoding
df = pd.get_dummies(df, columns=['state', 'country', 'currency'])
df = pd.get_dummies(df, columns=['deadline_weekday', 'state_changed_at_weekday', 'created_at_weekday', 'launched_at_weekday'])


In [9]:
# Convert categorical features to numerical using one-hot encoding
# df.info()

In [10]:
df['deadline'] = pd.to_datetime(df['deadline']).astype('int64') * 1e-9
df['state_changed_at'] = pd.to_datetime(df['state_changed_at']).astype('int64') * 1e-9
df['created_at'] = pd.to_datetime(df['created_at']).astype('int64') * 1e-9
df['launched_at'] = pd.to_datetime(df['launched_at']).astype('int64') * 1e-9

df['deadline_yr'] = pd.to_datetime(df['deadline']).dt.year.astype(float)
df['deadline_month'] = pd.to_datetime(df['deadline']).dt.month.astype(float)
df['deadline_day'] = pd.to_datetime(df['deadline']).dt.day.astype(float)
df['deadline_hr'] = pd.to_datetime(df['deadline']).dt.hour.astype(float)

df['state_changed_at_yr'] = pd.to_datetime(df['state_changed_at']).dt.year.astype(float)
df['state_changed_at_month'] = pd.to_datetime(df['state_changed_at']).dt.month.astype(float)
df['state_changed_at_day'] = pd.to_datetime(df['state_changed_at']).dt.day.astype(float)
df['state_changed_at_hr'] = pd.to_datetime(df['state_changed_at']).dt.hour.astype(float)

df['created_at_yr'] = pd.to_datetime(df['created_at']).dt.year.astype(float)
df['created_at_month'] = pd.to_datetime(df['created_at']).dt.month.astype(float)
df['created_at_day'] = pd.to_datetime(df['created_at']).dt.day.astype(float)
df['created_at_hr'] = pd.to_datetime(df['created_at']).dt.hour.astype(float)

df['launched_at_yr'] = pd.to_datetime(df['launched_at']).dt.year.astype(float)
df['launched_at_month'] = pd.to_datetime(df['launched_at']).dt.month.astype(float)
df['launched_at_day'] = pd.to_datetime(df['launched_at']).dt.day.astype(float)
df['launched_at_hr'] = pd.to_datetime(df['launched_at']).dt.hour.astype(float)

In [11]:
df['create_to_launch'] = pd.to_timedelta(df['create_to_launch']).dt.total_seconds() / 86400.0
df['launch_to_deadline'] = pd.to_timedelta(df['launch_to_deadline']).dt.total_seconds() / 86400.0
df['launch_to_state_change'] = pd.to_timedelta(df['launch_to_state_change']).dt.total_seconds() / 86400.0

df['create_to_launch_days'] = df['create_to_launch_days'].astype(float)
df['launch_to_deadline_days'] = df['launch_to_deadline_days'].astype(float)
df['launch_to_state_change_days'] = df['launch_to_state_change_days'].astype(float)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20632 entries, 0 to 20631
Columns: 109 entries, goal to launched_at_weekday_Wednesday
dtypes: float64(34), int64(8), uint8(67)
memory usage: 7.9 MB


In [13]:
# Split the data into training and testing sets
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# Set output variable
y_imbalanced = df["SuccessfulBool"]

# Drop unnecessary columns
X_imbalanced = df.drop(["deadline", "state_changed_at", "created_at", "launched_at", "name_len_clean", "blurb_len_clean", "SuccessfulBool"], axis=1)

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X_imbalanced, y_imbalanced, test_size=0.2, random_state=42)

In [15]:
# Calculate class distribution
class_distribution = y_imbalanced.value_counts(normalize=True) * 100

# Print class distribution
print("Data distribution:")
print(class_distribution)


Data distribution:
0    70.831718
1    29.168282
Name: SuccessfulBool, dtype: float64


In [16]:
# Apply undersampling to the majority class
undersampler = RandomUnderSampler(sampling_strategy=0.5, random_state=42)
X_train_undersampled, y_train_undersampled = undersampler.fit_resample(X_train, y_train)

# Apply SMOTE to the minority class
smote = SMOTE(sampling_strategy=1.0, random_state=42)
X, y = smote.fit_resample(X_train_undersampled, y_train_undersampled)


In [17]:
y_imbalanced

0        0
1        0
2        0
3        0
4        0
        ..
20627    0
20628    0
20629    0
20630    0
20631    0
Name: SuccessfulBool, Length: 20632, dtype: int64

In [18]:

# Calculate class distribution
class_distribution = y.value_counts(normalize=True) * 100

# Print class distribution
print("Data distribution:")
print(class_distribution)

Data distribution:
0    50.0
1    50.0
Name: SuccessfulBool, dtype: float64


In [19]:
# Assuming the DataFrame is named 'df'
column_names = X.columns.tolist()

print("Column names:")
for column in column_names:
    print(column)

Column names:
goal
pledged
staff_pick
backers_count
static_usd_rate
usd_pledged
spotlight
name_len
blurb_len
deadline_month
deadline_day
deadline_yr
deadline_hr
state_changed_at_month
state_changed_at_day
state_changed_at_yr
state_changed_at_hr
created_at_month
created_at_day
created_at_yr
created_at_hr
launched_at_month
launched_at_day
launched_at_yr
launched_at_hr
create_to_launch
launch_to_deadline
launch_to_state_change
create_to_launch_days
launch_to_deadline_days
launch_to_state_change_days
USorGB
TOPCOUNTRY
LaunchedTuesday
DeadlineWeekend
state_canceled
state_failed
state_live
state_successful
state_suspended
country_AT
country_AU
country_BE
country_CA
country_CH
country_DE
country_DK
country_ES
country_FR
country_GB
country_HK
country_IE
country_IT
country_LU
country_MX
country_NL
country_NO
country_NZ
country_SE
country_SG
country_US
currency_AUD
currency_CAD
currency_CHF
currency_DKK
currency_EUR
currency_GBP
currency_HKD
currency_MXN
currency_NOK
currency_NZD
currency_SEK
cu

In [21]:
#selected_features
X_numpy = X.values

## AdaBoost Classifier

In [22]:
# Initialize the AdaBoost classifier

ada_clf = AdaBoostClassifier()

# Define the number of folds for cross-validation
k = 5

# Create a StratifiedKFold object
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

# Create lists to store the evaluation metric values for each fold
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Perform stratified k-fold cross-validation
for train_index, test_index in skf.split(X_numpy, y):
    X_train, X_test = X_numpy[train_index], X_numpy[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Train the logistic regression model
    ada_clf.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = ada_clf.predict(X_test)
    
    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    # Append evaluation metric values to the respective lists
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

# Calculate the mean of the evaluation metric values across all folds
mean_accuracy = sum(accuracy_scores) / len(accuracy_scores)
mean_precision = sum(precision_scores) / len(precision_scores)
mean_recall = sum(recall_scores) / len(recall_scores)
mean_f1 = sum(f1_scores) / len(f1_scores)

# Print the results
print("Accuracy: {:.2f}%".format(mean_accuracy * 100))
print("Precision: {:.2f}%".format(mean_precision * 100))
print("Recall: {:.2f}%".format(mean_recall * 100))
print("F1 score: {:.2f}%".format(mean_f1 * 100))

Accuracy: 100.00%
Precision: 100.00%
Recall: 100.00%
F1 score: 100.00%


## Logistic Regression

In [23]:
from sklearn.linear_model import LogisticRegression

# Initialize the Logistic Regression classifier
log_reg_clf = LogisticRegression(random_state=42)

# Train the Logistic Regression classifier
log_reg_clf.fit(X_train, y_train)

k = 5
# Create a StratifiedKFold object
kf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

scores = cross_val_score(log_reg_clf, X, y, cv=kf)

print('Cross-validation scores:', scores)
print('Mean cross-validation score:', np.mean(scores))



Cross-validation scores: [0.94634019 0.94503777 0.94762897 0.94971339 0.95909328]
Mean cross-validation score: 0.9495627213151115


In [24]:
# Define the number of folds for cross-validation
k = 5

# Create a StratifiedKFold object
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

# Create lists to store the evaluation metric values for each fold
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Perform stratified k-fold cross-validation
for train_index, test_index in skf.split(X_numpy, y):
    X_train, X_test = X_numpy[train_index], X_numpy[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Train the logistic regression model
    log_reg_clf.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = log_reg_clf.predict(X_test)
    
    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    # Append evaluation metric values to the respective lists
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

# Calculate the mean of the evaluation metric values across all folds
mean_accuracy = sum(accuracy_scores) / len(accuracy_scores)
mean_precision = sum(precision_scores) / len(precision_scores)
mean_recall = sum(recall_scores) / len(recall_scores)
mean_f1 = sum(f1_scores) / len(f1_scores)

# Print the results
print("Accuracy: {:.2f}%".format(mean_accuracy * 100))
print("Precision: {:.2f}%".format(mean_precision * 100))
print("Recall: {:.2f}%".format(mean_recall * 100))
print("F1 score: {:.2f}%".format(mean_f1 * 100))

Accuracy: 94.96%
Precision: 94.08%
Recall: 95.96%
F1 score: 95.01%


## KNeighborsClassifier

In [25]:
from sklearn.neighbors import KNeighborsClassifier

# Initialize the K-NN classifier
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_train)

# Train the K-NN classifier
k = 5
# Create a StratifiedKFold object
kf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

scores = cross_val_score(knn_clf, X, y, cv=kf)

print('Cross-validation scores:', scores)
print('Mean cross-validation score:', np.mean(scores))

Cross-validation scores: [0.99192498 0.99036207 0.98801459 0.9890568  0.99088067]
Mean cross-validation score: 0.9900478224568046


In [26]:
from sklearn.neighbors import KNeighborsClassifier

# Initialize the Logistic Regression classifier
knn_clf = KNeighborsClassifier()

# Define the number of folds for cross-validation
k = 5

# Create a StratifiedKFold object
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

# Create lists to store the evaluation metric values for each fold
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Perform stratified k-fold cross-validation
for train_index, test_index in skf.split(X_numpy, y):
    X_train, X_test = X_numpy[train_index], X_numpy[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Train the logistic regression model
    knn_clf.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = knn_clf.predict(X_test)
    
    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    # Append evaluation metric values to the respective lists
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

# Calculate the mean of the evaluation metric values across all folds
mean_accuracy = sum(accuracy_scores) / len(accuracy_scores)
mean_precision = sum(precision_scores) / len(precision_scores)
mean_recall = sum(recall_scores) / len(recall_scores)
mean_f1 = sum(f1_scores) / len(f1_scores)

# Print the final evaluation metric values

# Print the results
print("Accuracy: {:.2f}%".format(mean_accuracy * 100))
print("Precision: {:.2f}%".format(mean_precision * 100))
print("Recall: {:.2f}%".format(mean_recall * 100))
print("F1 score: {:.2f}%".format(mean_f1 * 100))

Accuracy: 99.00%
Precision: 98.12%
Recall: 99.93%
F1 score: 99.01%


## DecisionTreeClassifier

In [27]:
from sklearn.tree import DecisionTreeClassifier

# Initialize the Decision Tree classifier
tree_clf = DecisionTreeClassifier(random_state=42)

# Train the Decision Tree classifier
tree_clf.fit(X_train, y_train)


# Train the K-NN classifier
k = 5
# Create a StratifiedKFold object
kf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

scores = cross_val_score(tree_clf, X, y, cv=kf)

print('Cross-validation scores:', scores)
print('Mean cross-validation score:', np.mean(scores))

Cross-validation scores: [1. 1. 1. 1. 1.]
Mean cross-validation score: 1.0


In [28]:
from sklearn.tree import DecisionTreeClassifier

# Initialize the Decision Tree classifier
tree_clf = DecisionTreeClassifier()

# Define the number of folds for cross-validation
k = 5

# Create a StratifiedKFold object
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

# Create lists to store the evaluation metric values for each fold
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Perform stratified k-fold cross-validation
for train_index, test_index in skf.split(X_numpy, y):
    X_train, X_test = X_numpy[train_index], X_numpy[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Train the logistic regression model
    tree_clf.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = tree_clf.predict(X_test)
    
    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    # Append evaluation metric values to the respective lists
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

# Calculate the mean of the evaluation metric values across all folds
mean_accuracy = sum(accuracy_scores) / len(accuracy_scores)
mean_precision = sum(precision_scores) / len(precision_scores)
mean_recall = sum(recall_scores) / len(recall_scores)
mean_f1 = sum(f1_scores) / len(f1_scores)

# Print the results
print("Accuracy: {:.2f}%".format(mean_accuracy * 100))
print("Precision: {:.2f}%".format(mean_precision * 100))
print("Recall: {:.2f}%".format(mean_recall * 100))
print("F1 score: {:.2f}%".format(mean_f1 * 100))

Accuracy: 100.00%
Precision: 100.00%
Recall: 100.00%
F1 score: 100.00%


## RandomForestClassifier

In [29]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the Random Forest classifier
forest_clf = RandomForestClassifier(random_state=42)

# Train the Random Forest classifier
forest_clf.fit(X_train, y_train)



# Train the K-NN classifier
k = 5
# Create a StratifiedKFold object
kf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

scores = cross_val_score(forest_clf, X, y, cv=kf)

print('Cross-validation scores:', scores)
print('Mean cross-validation score:', np.mean(scores))

Cross-validation scores: [1. 1. 1. 1. 1.]
Mean cross-validation score: 1.0


In [30]:

# Define the number of folds for cross-validation
k = 5

# Create a StratifiedKFold object
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

# Create lists to store the evaluation metric values for each fold
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Perform stratified k-fold cross-validation
for train_index, test_index in skf.split(X_numpy, y):
    X_train, X_test = X_numpy[train_index], X_numpy[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Train the logistic regression model
    forest_clf.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = forest_clf.predict(X_test)
    
    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    # Append evaluation metric values to the respective lists
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

# Calculate the mean of the evaluation metric values across all folds
mean_accuracy = sum(accuracy_scores) / len(accuracy_scores)
mean_precision = sum(precision_scores) / len(precision_scores)
mean_recall = sum(recall_scores) / len(recall_scores)
mean_f1 = sum(f1_scores) / len(f1_scores)

# Print the results
print("Accuracy: {:.2f}%".format(mean_accuracy * 100))
print("Precision: {:.2f}%".format(mean_precision * 100))
print("Recall: {:.2f}%".format(mean_recall * 100))
print("F1 score: {:.2f}%".format(mean_f1 * 100))

Accuracy: 100.00%
Precision: 100.00%
Recall: 100.00%
F1 score: 100.00%


## GaussianNB

In [31]:
from sklearn.naive_bayes import GaussianNB

# Initialize the Naive Bayes classifier
nb_clf = GaussianNB()

# Train the Naive Bayes classifier
nb_clf.fit(X_train, y_train)

# Train the K-NN classifier
k = 5
# Create a StratifiedKFold object
kf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

scores = cross_val_score(nb_clf, X, y, cv=kf)

print('Cross-validation scores:', scores)
print('Mean cross-validation score:', np.mean(scores))

Cross-validation scores: [0.65746288 0.63714509 0.65033872 0.64226159 0.63861386]
Mean cross-validation score: 0.645164428974944


In [32]:

# Define the number of folds for cross-validation
k = 5

# Create a StratifiedKFold object
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

# Create lists to store the evaluation metric values for each fold
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Perform stratified k-fold cross-validation
for train_index, test_index in skf.split(X_numpy, y):
    X_train, X_test = X_numpy[train_index], X_numpy[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Train the logistic regression model
    nb_clf.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = nb_clf.predict(X_test)
    
    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    # Append evaluation metric values to the respective lists
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

# Calculate the mean of the evaluation metric values across all folds
mean_accuracy = sum(accuracy_scores) / len(accuracy_scores)
mean_precision = sum(precision_scores) / len(precision_scores)
mean_recall = sum(recall_scores) / len(recall_scores)
mean_f1 = sum(f1_scores) / len(f1_scores)

# Print the results
print("Accuracy: {:.2f}%".format(mean_accuracy * 100))
print("Precision: {:.2f}%".format(mean_precision * 100))
print("Recall: {:.2f}%".format(mean_recall * 100))
print("F1 score: {:.2f}%".format(mean_f1 * 100))

Accuracy: 64.52%
Precision: 95.04%
Recall: 30.63%
F1 score: 46.31%


## Gradient Boosting Classifier

In [33]:
from sklearn.ensemble import GradientBoostingClassifier

# Initialize the Gradient Boosting classifier
gb_clf = GradientBoostingClassifier(random_state=42)

# Train the Gradient Boosting classifier
gb_clf.fit(X_train, y_train)

# Train the K-NN classifier
k = 5
# Create a StratifiedKFold object
kf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

scores = cross_val_score(gb_clf, X, y, cv=kf)

print('Cross-validation scores:', scores)
print('Mean cross-validation score:', np.mean(scores))

Cross-validation scores: [1. 1. 1. 1. 1.]
Mean cross-validation score: 1.0


In [34]:

# Define the number of folds for cross-validation
k = 5

# Create a StratifiedKFold object
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

# Create lists to store the evaluation metric values for each fold
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Perform stratified k-fold cross-validation
for train_index, test_index in skf.split(X_numpy, y):
    X_train, X_test = X_numpy[train_index], X_numpy[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Train the logistic regression model
    gb_clf.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = gb_clf.predict(X_test)
    
    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    # Append evaluation metric values to the respective lists
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

# Calculate the mean of the evaluation metric values across all folds
mean_accuracy = sum(accuracy_scores) / len(accuracy_scores)
mean_precision = sum(precision_scores) / len(precision_scores)
mean_recall = sum(recall_scores) / len(recall_scores)
mean_f1 = sum(f1_scores) / len(f1_scores)

# Print the results
print("Accuracy: {:.2f}%".format(mean_accuracy * 100))
print("Precision: {:.2f}%".format(mean_precision * 100))
print("Recall: {:.2f}%".format(mean_recall * 100))
print("F1 score: {:.2f}%".format(mean_f1 * 100))

Accuracy: 100.00%
Precision: 100.00%
Recall: 100.00%
F1 score: 100.00%


## MLP Classifier

In [35]:
from sklearn.neural_network import MLPClassifier

# Initialize the Neural Network classifier
nn_clf = MLPClassifier(random_state=42)

# Train the Neural Network classifier
nn_clf.fit(X_train, y_train)

# Train the K-NN classifier
k = 5
# Create a StratifiedKFold object
kf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

scores = cross_val_score(nn_clf, X, y, cv=kf)

print('Cross-validation scores:', scores)
print('Mean cross-validation score:', np.mean(scores))

Cross-validation scores: [0.9713467  0.97551446 0.97498697 0.93955185 0.96143825]
Mean cross-validation score: 0.9645676466304449


In [36]:

# Define the number of folds for cross-validation
k = 5

# Create a StratifiedKFold object
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

# Create lists to store the evaluation metric values for each fold
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Perform stratified k-fold cross-validation
for train_index, test_index in skf.split(X_numpy, y):
    X_train, X_test = X_numpy[train_index], X_numpy[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Train the logistic regression model
    nn_clf.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = nn_clf.predict(X_test)
    
    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    # Append evaluation metric values to the respective lists
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

# Calculate the mean of the evaluation metric values across all folds
mean_accuracy = sum(accuracy_scores) / len(accuracy_scores)
mean_precision = sum(precision_scores) / len(precision_scores)
mean_recall = sum(recall_scores) / len(recall_scores)
mean_f1 = sum(f1_scores) / len(f1_scores)

# Print the results
print("Accuracy: {:.2f}%".format(mean_accuracy * 100))
print("Precision: {:.2f}%".format(mean_precision * 100))
print("Recall: {:.2f}%".format(mean_recall * 100))
print("F1 score: {:.2f}%".format(mean_f1 * 100))

Accuracy: 96.46%
Precision: 98.33%
Recall: 94.54%
F1 score: 96.36%
