# بسم الله الرحمن الرحيم

In [None]:
from utils import *
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from matplotlib import pyplot as plt


2023-05-16 20:26:04.623607: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-16 20:26:04.797983: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-05-16 20:26:04.798021: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [None]:
# Read the raw training set from disk into a DataFrame.
train_csv_path = "dataset/body_level_classification_train.csv"
df = pd.read_csv(train_csv_path)

In [None]:
# Run the project's cleaning_data helper (not defined in this notebook, presumably
# from utils) and keep an independent copy of whatever it returns.
df_original= cleaning_data(df).copy()

In [None]:
# Split the cleaned frame into a training part and a held-out part.
# NOTE(review): 1/3 is presumably the held-out fraction — confirm in data_spliting.
all_class_train, all_class_test = data_spliting(df_original, 1/3)

# Shuffle both parts reproducibly: sample(frac=1.0) returns every row in random order.
shuffle_seed = 42
all_class_train = all_class_train.sample(frac=1.0, random_state=shuffle_seed)
all_class_test = all_class_test.sample(frac=1.0, random_state=shuffle_seed)

In [None]:
# Separate the feature columns from the "Body_Level" target for both parts
# and convert everything to NumPy arrays.
target_col = "Body_Level"
X_train = all_class_train.drop(columns=[target_col]).to_numpy()
Y_train = all_class_train[target_col].to_numpy()
X_test = all_class_test.drop(columns=[target_col]).to_numpy()
Y_test = all_class_test[target_col].to_numpy()

In [None]:
# Inverse-frequency class weights: n_samples / (n_classes * class_count), the same
# formula as scikit-learn's "balanced" heuristic. The number of classes is taken
# from the labels actually present instead of being hard-coded.
class_counts = all_class_train["Body_Level"].value_counts().to_dict()
total_num = sum(class_counts.values())
n_classes = len(class_counts)
class_weights = {
    label: total_num / (n_classes * count)
    for label, count in class_counts.items()
}

In [None]:
# data_scaling (project helper, not shown here) returns the replacement training
# features together with a scaler object whose .transform() is reused below.
X_train, scaler= data_scaling(X_train)


In [None]:
# Cut the held-out data in half: one half becomes the validation set,
# the other remains the final test set.
X_val, X_test, Y_val, Y_test = train_test_split(
    X_test, Y_test, random_state=42, test_size=0.5
)

In [None]:
# Apply the scaler obtained from the training features to the validation and test
# features, so all three sets go through the same transform.
X_val, X_test = scaler.transform(X_val), scaler.transform(X_test)

# Bare Logistic Regression 

In [None]:
# Baseline: LogisticRegression with all scikit-learn defaults.
bare_reg_model = LogisticRegression()

# Nothing is tuned on the validation set for the baseline, so the training and
# validation rows are merged and used together for fitting.
baseline_X = np.concatenate([X_train, X_val], axis=0)
baseline_Y = np.concatenate([Y_train, Y_val], axis=0)
bare_reg_model.fit(baseline_X, baseline_Y)

## accuracy on test

In [None]:
# Evaluate the baseline on the test split. test_model is a project helper (not shown
# here); its two return values are used below as an accuracy figure and a printable
# report — confirm the exact contents against utils.
bare_accuracy , repo = test_model(bare_reg_model,X_test,Y_test,axis=False)

In [None]:
# First value returned by test_model for the baseline model.
print(bare_accuracy)

In [None]:
# Second value returned by test_model for the baseline model.
print(repo)

## Applying hyperparameter tuning
**In this section we get the most out of logistic regression by tuning its hyperparameters and keeping the best-performing values.**

In [None]:

# Estimator whose regularisation parameter C is tuned below.
reg_model = LogisticRegression(random_state=42, max_iter=3000)

# Candidate values tried exhaustively by GridSearchCV (L2 penalty only).
param_grid = {
    "C": [0.001, 0.01, 0.1, 1, 10, 100, 200, 300, 400, 500, 600, 700],
    "penalty": ["l2"],
}

grid_search = GridSearchCV(
    estimator=reg_model,
    param_grid=param_grid,
    cv=2,
    verbose=2,
)

# Training and validation rows are merged because cross-validation
# creates its own train/validation folds.
grid_search.fit(
    np.concatenate([X_train, X_val], axis=0),
    np.concatenate([Y_train, Y_val], axis=0),
)

# Keep the per-candidate results for the report here and the plot further down.
cv_results = grid_search.cv_results_

# Print each candidate C next to its mean cross-validated score.
for params, mean_score in zip(cv_results["params"], cv_results["mean_test_score"]):
    print(params["C"], mean_score)



In [None]:
# best_estimator_ is the model GridSearchCV refit on the full data passed to fit()
# using the best-scoring parameter combination (refit is left at its default).
best_reg_model = grid_search.best_estimator_

## accuracy on test on best reg model 

In [None]:
# Evaluate the tuned model on the test split via the project's test_model helper
# (not shown here; returns two values that are printed below).
best_accuracy , repo = test_model(best_reg_model,X_test,Y_test,axis=False)

In [None]:
# Show both values returned by test_model for the tuned model, one per line.
print(best_accuracy, repo, sep="\n")

## Plotting the change in accuracy when tuning the 'C' hyperparameter

In [None]:
# Plot the mean cross-validated accuracy obtained for every candidate C.
x_axis = [params["C"] for params in cv_results["params"]]
y_axis = list(cv_results["mean_test_score"])

# The candidates are drawn at evenly spaced positions and labelled with their actual
# C value, because the grid is not evenly spaced on a linear axis.
positions = np.arange(len(x_axis))
best_idx = int(np.argmax(y_axis))

plt.plot(positions, y_axis)
plt.xticks(positions, x_axis, rotation=45)
plt.title('Changing the hyperparameter C vs accuracy')
plt.xlabel('C')
plt.ylabel('accuracy')
# The marker uses the same 0-based positions as the curve, so the red line passes
# exactly through the best-scoring point.
plt.axvline(positions[best_idx], color='red', label=f"C={x_axis[best_idx]}")
plt.legend()
plt.show()

## Regression model with class weights only 

In [None]:
# Default-C logistic regression that uses the class_weights dictionary
# computed earlier in the notebook.
only_weights_model = LogisticRegression(
    class_weight=class_weights,
    max_iter=3000,
    random_state=42,
)



In [None]:
# Fit on the merged training + validation rows.
weights_fit_X = np.concatenate([X_train, X_val], axis=0)
weights_fit_Y = np.concatenate([Y_train, Y_val], axis=0)
only_weights_model.fit(weights_fit_X, weights_fit_Y)

In [None]:
# Evaluate the class-weighted model on the test split via the project's test_model
# helper (not shown here; returns two values that are printed below).
only_weights_accuracy , repo = test_model(only_weights_model,X_test,Y_test,axis=False)

In [None]:
# Show both values returned by test_model for the class-weighted model, one per line.
print(only_weights_accuracy, repo, sep="\n")

## Regression model with best C only


In [None]:
# NOTE(review): C is hard-coded to 700 — presumably the best value reported by the
# grid search above; confirm it still matches grid_search.best_params_ after a re-run.
only_C_model = LogisticRegression(C=700, max_iter=3000, random_state=42)


In [None]:
# Fit on the merged training + validation rows.
c_only_fit_X = np.concatenate([X_train, X_val], axis=0)
c_only_fit_Y = np.concatenate([Y_train, Y_val], axis=0)
only_C_model.fit(c_only_fit_X, c_only_fit_Y)

In [None]:
# Evaluate the fixed-C model on the test split via the project's test_model helper
# (not shown here; returns two values that are printed below).
only_C_accuracy , repo = test_model(only_C_model,X_test,Y_test,axis=False)

In [None]:
# Show both values returned by test_model for the fixed-C model, one per line.
print(only_C_accuracy, repo, sep="\n")

**As the analysis above shows, tuning the parameter C alone, or providing class_weights alone, is much better than applying both of them.**

# oversampling 

In [None]:
# Build an oversampled version of the training frame with the project's
# over_sampling helper (not shown here).
# NOTE(review): the exact meaning of sampling_ratio=0.5 is defined in utils — confirm.
oversampled= over_sampling(all_class_train,sampling_ratio=0.5)

In [None]:
# Recompute the inverse-frequency class weights on the oversampled frame:
# n_samples / (n_classes * class_count), the same formula as scikit-learn's
# "balanced" heuristic. The number of classes is taken from the labels actually
# present instead of being hard-coded.
class_counts = oversampled["Body_Level"].value_counts().to_dict()
total_num = sum(class_counts.values())
n_classes = len(class_counts)
class_weights = {
    label: total_num / (n_classes * count)
    for label, count in class_counts.items()
}

In [None]:
# Display the weights computed from the oversampled frame (notebook cell output).
class_weights

In [None]:
# Rebuild the arrays: training data now comes from the oversampled frame, while the
# held-out data is taken again from the original (not oversampled) test frame.
target_col = "Body_Level"
X_train = oversampled.drop(columns=[target_col]).to_numpy()
Y_train = oversampled[target_col].to_numpy()
X_test = all_class_test.drop(columns=[target_col]).to_numpy()
Y_test = all_class_test[target_col].to_numpy()

In [None]:
# data_scaling (project helper) returns the replacement training features and a
# scaler object for the oversampled data.
X_train, scaler = data_scaling(X_train)

# Same 50/50 validation/test cut and seed as in the first experiment.
X_val, X_test, Y_val, Y_test = train_test_split(
    X_test, Y_test, random_state=42, test_size=0.5
)

# Put validation and test features through the scaler returned above.
X_val, X_test = scaler.transform(X_val), scaler.transform(X_test)

## Applying the bare Regression on Oversampled data 

In [None]:
# Untuned logistic regression on the oversampled data; only max_iter is raised.
bare_oversampled_reg = LogisticRegression(max_iter=1000)

# Fit on the merged training + validation rows.
oversampled_fit_X = np.concatenate([X_train, X_val], axis=0)
oversampled_fit_Y = np.concatenate([Y_train, Y_val], axis=0)
bare_oversampled_reg.fit(oversampled_fit_X, oversampled_fit_Y)

In [None]:
# Evaluate the untuned oversampled-data model on the test split via the project's
# test_model helper (not shown here; returns two values that are printed below).
bare_oversampled_accuracy , repo = test_model(bare_oversampled_reg,X_test,Y_test,axis=False)

In [None]:
# Show both values returned by test_model for the untuned oversampled-data model.
print(bare_oversampled_accuracy, repo, sep="\n")

## Applying Regression on oversampled data with class_weights only

In [None]:
# Default-C logistic regression on the oversampled data, using the class weights
# recomputed from the oversampled frame.
oversampled_class_weights = LogisticRegression(
    class_weight=class_weights,
    max_iter=1000,
)

# Fit on the merged training + validation rows.
weighted_fit_X = np.concatenate([X_train, X_val], axis=0)
weighted_fit_Y = np.concatenate([Y_train, Y_val], axis=0)
oversampled_class_weights.fit(weighted_fit_X, weighted_fit_Y)

In [None]:
# Evaluate the class-weighted oversampled-data model on the test split via the
# project's test_model helper (not shown here; returns two values printed below).
oversampled_class_weights_accuracy , repo = test_model(oversampled_class_weights,X_test,
                                                       Y_test,axis=False)

In [None]:
# Show both values returned by test_model for the class-weighted oversampled-data model.
print(oversampled_class_weights_accuracy, repo, sep="\n")

## Tuning the C value on oversampled data  

In [None]:
# NOTE(review): these imports repeat names already imported at the top of the
# notebook, and make_classification is not used in the visible cells; they are
# kept so the cell's namespace is unchanged.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score

# Estimator whose regularisation parameter C is tuned on the oversampled data.
reg_model = LogisticRegression(random_state=42, max_iter=3000)

# Candidate values tried exhaustively by GridSearchCV (L2 penalty only).
param_grid = {
    "C": [0.001, 0.01, 0.1, 1, 10, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
    "penalty": ["l2"],
}

grid_search = GridSearchCV(
    estimator=reg_model,
    param_grid=param_grid,
    cv=5,
    verbose=1,
)

# Training and validation rows are merged; cross-validation creates its own folds.
grid_search.fit(
    np.concatenate([X_train, X_val], axis=0),
    np.concatenate([Y_train, Y_val], axis=0),
)

# Keep the per-candidate results for the report here and the plot further down.
cv_results = grid_search.cv_results_

# Print each parameter combination next to its mean cross-validated score.
for params, mean_score in zip(cv_results["params"], cv_results["mean_test_score"]):
    print(params, mean_score)



In [None]:
# Evaluate the grid search's refit best estimator on the test split via the project's
# test_model helper (not shown here; returns two values that are printed below).
Oversampled_C_only_accuracy , repo = test_model(grid_search.best_estimator_,
                                                X_test,Y_test,axis=False)

In [None]:
# Show both values returned by test_model for the tuned oversampled-data model.
print(Oversampled_C_only_accuracy, repo, sep="\n")

## Plot the different values of C 

In [None]:
# Plot the mean cross-validated accuracy obtained for every candidate C
# of the grid search run on the oversampled data.
x_axis = [params["C"] for params in cv_results["params"]]
y_axis = list(cv_results["mean_test_score"])

# The candidates are drawn at evenly spaced positions and labelled with their actual
# C value, because the grid is not evenly spaced on a linear axis.
positions = np.arange(len(x_axis))
best_idx = int(np.argmax(y_axis))

plt.plot(positions, y_axis)
plt.xticks(positions, x_axis, rotation=45)
plt.title('Changing the hyperparameter C vs accuracy')
plt.xlabel('C')
plt.ylabel('accuracy')
# The marker uses the same 0-based positions as the curve, so the red line passes
# exactly through the best-scoring point.
plt.axvline(positions[best_idx], color='red', label=f"C={x_axis[best_idx]}")
plt.legend()
plt.show()

## Logistic Regression on best C and class weights on Oversampled data

In [None]:
# NOTE(review): C=600 is hard-coded — presumably the best value reported by the grid
# search on the oversampled data; confirm against grid_search.best_params_ after a re-run.
best_oversampled_class_weights = LogisticRegression(
    C=600,
    class_weight=class_weights,
    max_iter=1000,
)

# Fit on the merged training + validation rows.
best_fit_X = np.concatenate([X_train, X_val], axis=0)
best_fit_Y = np.concatenate([Y_train, Y_val], axis=0)
best_oversampled_class_weights.fit(best_fit_X, best_fit_Y)

In [None]:
# Evaluate the fixed-C, class-weighted oversampled-data model on the test split via
# the project's test_model helper (not shown here; returns two values printed below).
best_oversampled_class_weights_accuracy , repo = test_model(best_oversampled_class_weights,
                                                            X_test,Y_test,axis=False)

In [None]:
# Show both values returned by test_model for the fixed-C, class-weighted model.
print(best_oversampled_class_weights_accuracy, repo, sep="\n")