In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
from sklearn.preprocessing import LabelEncoder,OneHotEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression, LogisticRegression
from yellowbrick.classifier import ConfusionMatrix
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,ConfusionMatrixDisplay,classification_report
import os
# Enumerate every file available under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)


In [None]:
# Competition files: the training set and the test set to predict on.
TRAIN_CSV = "/kaggle/input/playground-series-s4e2/train.csv"
TEST_CSV = "/kaggle/input/playground-series-s4e2/test.csv"

df_train, df_test = pd.read_csv(TRAIN_CSV), pd.read_csv(TEST_CSV)

In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
# Preview the first rows of the test frame.
df_test.head()

In [None]:
# Column dtypes and non-null counts for the training frame.
df_train.info()

In [None]:
# Column dtypes and non-null counts for the test frame.
df_test.info()

In [None]:
# Remove the row identifier so it is not used as a feature.
# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because 'id' has already been dropped.
df_train = df_train.drop(columns='id', errors='ignore')

In [None]:
# drop_duplicates() returns a new frame instead of modifying df_train in place,
# so the result has to be assigned back for the duplicates to actually be removed.
# ignore_index=True renumbers the rows 0..n-1 after the removal.
df_train = df_train.drop_duplicates(ignore_index=True)
df_train

In [None]:
# Number of missing values in each column.
df_train.isna().sum()

In [None]:
# Features are every column except the last one; the last column is the target.
# .copy() gives X its own data, so the column assignments performed on X later
# do not operate on a slice of df_train (avoids pandas' SettingWithCopyWarning).
X = df_train.iloc[:, :-1].copy()
y = df_train.iloc[:, -1]

In [None]:
# Display the feature frame.
X

In [None]:
# Display the target column.
y

In [None]:
# Names of the feature columns stored with object dtype.
cat_col = X.select_dtypes(include="object").columns

In [None]:
# Display the object-dtype column names.
cat_col

In [None]:
# Names of the feature columns that are not object dtype.
num_col = X.select_dtypes(exclude="object").columns

In [None]:
# Display the non-object column names.
num_col

In [None]:
# For each categorical column, report how many distinct values it has and list them.
for col in cat_col:
    levels = X[col].unique()
    print("{} has {} items: ".format(col, len(levels)), levels)

In [None]:
# Encoder used below to turn categorical labels into integer codes.
label_encoder = LabelEncoder()
# NOTE(review): `encoder` is not referenced by any cell visible in this review — confirm it is still needed.
encoder = OneHotEncoder()

In [None]:
# Replace each categorical column of X with integer codes.
# The single encoder is refit on every column, so once the loop finishes it
# only holds the classes of the last column processed.
for col in cat_col:
    X[col] = label_encoder.fit_transform(X[col])

In [None]:
# Same per-column summary as earlier; run after the encoding cell it lists the integer codes.
for col in cat_col:
    levels = X[col].unique()
    print("{} has {} items: ".format(col, len(levels)), levels)

In [None]:
# Encode the test set with the SAME label -> code mapping used for the training
# features. Refitting the encoder on df_test assigns codes from the test set's
# own sorted categories, which silently disagrees with training whenever the
# two sets do not contain exactly the same categories.
for col in cat_col:
    if pd.api.types.is_numeric_dtype(df_test[col]):
        continue  # already encoded; keeps the cell safe to re-run
    # df_train still carries the raw labels (the encoding above was written to X),
    # and LabelEncoder orders its classes deterministically, so fitting on the
    # training column reproduces the codes used for X.
    train_classes = LabelEncoder().fit(df_train[col]).classes_
    # Labels that occur only in the test set get fresh codes after the known
    # ones, instead of raising or shifting the training codes.
    unseen = sorted(set(df_test[col].unique()) - set(train_classes), key=str)
    code_of = {label: code for code, label in enumerate([*train_classes, *unseen])}
    df_test[col] = df_test[col].map(code_of)

In [None]:
# Score every feature against the target with the chi-squared statistic.
# Only the per-feature scores are read afterwards, so k='all' is used instead of
# a hard-coded 16: a fixed k fails (or warns, depending on the scikit-learn
# version) as soon as X has fewer columns than that.
ordered_rank_features = SelectKBest(score_func=chi2, k='all')
ordered_feature = ordered_rank_features.fit(X, y)

In [None]:
# One-column frames holding the feature names and their chi-squared scores.
X_columns = pd.DataFrame(X.columns)
X_scores = pd.DataFrame({'Score': ordered_feature.scores_})

In [None]:
# Place names and scores side by side (row i of each frame describes feature i).
features_rank = pd.concat([X_columns, X_scores], axis="columns")

In [None]:
# Give the ranking table readable column names, then display it.
features_rank.columns = ['Features', 'Score']
features_rank

In [None]:
# The ten rows with the highest value in the 'Score' column.
features_rank.nlargest(10, 'Score')

In [None]:
# Fit an extra-trees ensemble to obtain per-feature importances.
# A fixed random_state makes the importances, and therefore the top-10 feature
# selection derived from them further down, reproducible across runs.
model = ExtraTreesClassifier(random_state=42)
model.fit(X, y)

In [None]:
# Raw importance values from the fitted model.
print(model.feature_importances_)

In [None]:
# Importance per feature, labelled by column name; plot the ten largest as horizontal bars.
ranked_features = pd.Series(model.feature_importances_, index=X.columns)
top_ten = ranked_features.nlargest(10)
top_ten.plot.barh()
plt.show()

In [None]:
# Column names of the ten most important features; later cells use this to subset X and df_test.
best_features = ranked_features.nlargest(10).index

In [None]:
# Pairwise correlation between the feature columns, drawn as an annotated heatmap.
corr = X.corr()
top_features = corr.index
plt.figure(figsize=(15, 10))
# X[top_features].corr() would recompute exactly this matrix, so reuse `corr`.
sns.heatmap(corr, annot=True)

In [None]:
threshold = 0.8


def correlation(dataset, threshold):
    """Return the names of columns that are strongly correlated with an earlier column.

    A column is reported when the absolute value of its correlation with any
    column positioned before it in ``dataset.corr()`` is strictly greater than
    ``threshold``. The first column can therefore never be reported.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose pairwise column correlations are examined.
    threshold : float
        Cut-off applied to the absolute correlation.

    Returns
    -------
    set
        Names of the flagged columns.
    """
    strength = dataset.corr().abs()
    names = strength.columns
    return {
        names[row]
        for row in range(len(names))
        # Only the entries left of the diagonal: columns that come before `row`.
        if (strength.iloc[row, :row] > threshold).any()
    }

In [None]:
# Columns of X whose absolute correlation with an earlier column exceeds the threshold.
correlation(X, threshold)

In [None]:
# Keep only the selected feature columns, in ranking order.
X = X.loc[:, best_features]

In [None]:
# Display the reduced feature frame.
X

In [None]:
# Apply the same column selection (and order) to the test frame.
df_test = df_test.loc[:, best_features]

In [None]:
# Scaler shared by the training features and the test frame.
scaler = StandardScaler()

In [None]:
# Fit the scaler on X and return the scaled features.
# NOTE(review): the scaler is fit on all of X before the train/validation split
# made in a later cell, so validation rows influence the scaling statistics —
# confirm this is acceptable.
X_scaled = scaler.fit_transform(X)

In [None]:
# Inspect the scaled feature array.
print(X_scaled)

In [None]:
# Scale the test features with the statistics already learned from the training
# features. Calling fit_transform here would refit the scaler on the test set, so
# the models would be asked to predict on inputs scaled differently from the data
# they were trained on.
df_test_scaled = scaler.transform(df_test)

In [None]:
# Hold out 30% of the scaled rows for validation; the seed fixes the partition.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Display the training portion of the scaled features.
X_train

In [None]:
# Random forest.
# The hyperparameters lie inside the grid previously explored with GridSearchCV
# (n_estimators: 50/100/150, max_depth: None/10/20/30, min_samples_split: 2/5/10,
# min_samples_leaf: 1/2/4; cv=5, scoring='accuracy').
# random_state fixes the forest's randomness so the reported accuracy and the
# predictions written to the submission file are reproducible.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=20,
    min_samples_split=2,
    min_samples_leaf=4,
    random_state=42,
)
rf.fit(X_train, y_train)

# Accuracy on the held-out split.
y_pred_rf = rf.predict(X_test)
print(f"Accuracy:{accuracy_score(y_test, y_pred_rf)}")


In [None]:
# Single decision tree.
# The hyperparameters lie inside the grid previously explored with GridSearchCV
# (max_depth: 150/155/160, min_samples_split: 2/3, min_samples_leaf: 1/2/3,
# criterion: gini/entropy; cv=5, scoring='accuracy').
# random_state fixes the tree's randomness so repeated runs build the same tree.
dt = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=150,
    min_samples_split=2,
    min_samples_leaf=3,
    random_state=42,
)
dt.fit(X_train, y_train)

# Accuracy on the held-out split.
y_pred_dt = dt.predict(X_test)
print(f"Accuracy:{accuracy_score(y_test, y_pred_dt)}")

In [None]:
# Gradient-boosted trees.
# The hyperparameters lie inside the grid previously explored with GridSearchCV
# (learning_rate: 0.1/0.05, n_estimators: 50/100/150, max_depth: 3/4/5;
# cv=5, scoring='accuracy').
# random_state fixes the estimator's randomness so the reported accuracy and the
# submission predictions are reproducible.
gradient = GradientBoostingClassifier(
    learning_rate=0.1,
    n_estimators=150,
    max_depth=4,
    random_state=42,
)
gradient.fit(X_train, y_train)

# Accuracy on the held-out split.
y_pred_gradient = gradient.predict(X_test)
print(f"Accuracy:{accuracy_score(y_test, y_pred_gradient)}")


In [None]:
# LightGBM multiclass classifier with a fixed parameter set.
best_params = {
    # --- task definition ---
    "objective": "multiclass",
    "num_class": 7,
    "metric": "multi_logloss",
    "boosting_type": "gbdt",
    "random_state": 42,   # reproducible training
    "verbosity": -1,      # silence LightGBM logging
    # --- boosting schedule ---
    "n_estimators": 500,
    "learning_rate": 0.030962211546832760,
    # --- tree shape ---
    "max_depth": 10,
    "min_child_samples": 26,
    # --- regularisation ---
    "lambda_l1": 0.009667446568254372,
    "lambda_l2": 0.04018641437301800,
    # --- sampling ---
    "colsample_bytree": 0.40977129346872643,  # fraction of features per tree
    # NOTE(review): per the LightGBM parameter docs, row subsampling only takes
    # effect when subsample_freq is non-zero, which is not set here — confirm
    # whether this value is actually being applied.
    "subsample": 0.9535797422450176,
}

lgbm_classifier = LGBMClassifier(**best_params).fit(X_train, y_train)

# Accuracy on the held-out split (shown as the cell output).
y_pred = lgbm_classifier.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
# Template supplying the submission file layout.
submission = pd.read_csv("/kaggle/input/playground-series-s4e2/sample_submission.csv")

# One prediction vector per fitted classifier, in the order the files are written.
predictions_rf, predictions_dt, predictions_lgbm, predictions_gradient = (
    estimator.predict(df_test_scaled)
    for estimator in (rf, dt, lgbm_classifier, gradient)
)
predictions = [predictions_rf, predictions_dt, predictions_lgbm, predictions_gradient]
model_names = ['rf', 'dt', 'lgbm', 'gradient']

# Reuse the same frame for every model: overwrite the target column, then save
# it under a file name derived from the model name.
for model_name, pred in zip(model_names, predictions):
    submission["NObeyesdad"] = pred
    submission.to_csv(f'{model_name}_predictions.csv', index=False)
#submission.head()