## Initial logistic stat model test

In [None]:
# Predictors: every column except the raw label and the species name.
X = codon_df_clean.drop(['Kingdom', 'SpeciesName'], axis=1)
# Response: the raw 'Kingdom' label column.
y = codon_df_clean.loc[:, 'Kingdom']

In [None]:
# Correlation heatmap between independent variables (your X)
# Compute the correlation matrix once and reuse it for the mask and the plot.
corr_matrix = X.corr()

# Build the mask from the matrix *shape*, not its values: a value-based mask
# (np.triu(corr)) leaves upper-triangle cells whose correlation is exactly 0
# unmasked, because 0.0 converts to False. True marks a cell to hide, so this
# hides the upper triangle and the diagonal.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

plt.figure(figsize=(20, 10))
sns.heatmap(data=corr_matrix, annot=False, cmap="coolwarm", mask=mask, vmax=1, vmin=-1)
plt.tight_layout()
plt.show()

In [None]:
# Add the constant (intercept) column to the predictors.
# NOTE(review): `sm` is presumably statsmodels.api, imported outside this cell -- confirm.
X_modelling = sm.add_constant(X)

# Single-predictor design matrix: the 'const' column plus the 'UUU' column.
X_test = X_modelling.loc[:, ['const', 'UUU']]

In [None]:
# Fit a multinomial logit of the Kingdom label (y) on the const + 'UUU' design matrix.
model_1 = sm.MNLogit(endog=y, exog=X_test).fit()

# Last expression of the cell, so the notebook renders the summary.
model_1.summary()

In [None]:
## ML logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Target is the mapped Kingdom column; features are everything except the
# raw/mapped label columns and the species name.
y_1 = codon_df_clean['Kingdom_mapped']
X_1 = codon_df_clean.drop(columns=['Kingdom', 'SpeciesName', 'Kingdom_mapped', 'DNAtype_mapped'])
X_train, X_test, y_train, y_test = train_test_split(X_1, y_1, test_size=0.3, random_state=17)

# A bare expression in the middle of a cell is evaluated and discarded, so the
# shapes are printed explicitly.
print(X_train.shape)
print(y_train.shape)

# Fit the scaler on the training split only, then apply it to both splits.
my_LR_scaler = StandardScaler()
my_LR_scaler.fit(X_train)
X_train_scaled = my_LR_scaler.transform(X_train)
X_test_scaled = my_LR_scaler.transform(X_test)

# Sweep the inverse regularisation strength C and record accuracy on both splits.
c_list = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
train_scores = []
test_scores = []

for c in c_list:
    log_model_preview = LogisticRegression(C=c, random_state=7, max_iter=1000, n_jobs=4).fit(X_train_scaled, y_train)
    train_scores.append(log_model_preview.score(X_train_scaled, y_train))
    test_scores.append(log_model_preview.score(X_test_scaled, y_test))

# Print the score table; as a bare mid-cell expression it was never shown.
print(pd.DataFrame({'C value': c_list, 'train score': train_scores, 'test score': test_scores}))

plt.figure(figsize=(10, 5))
plt.plot(c_list, train_scores, label='Train score', marker='.')
plt.plot(c_list, test_scores, label='Test score', marker='.')
plt.legend()
plt.title('Effect of Varying C on Logistic Regression scores Preview')
plt.xscale('log')  # C spans several orders of magnitude
plt.xlabel('C')
plt.ylabel('Accuracy')
plt.grid()
plt.show()

# NOTE(review): C=1 appears to be chosen from the sweep above, which scores on
# the test split; if so the test accuracy below is optimistic. Consider a
# separate validation split or cross-validation for tuning.
log_model_tunned = LogisticRegression(C=1, max_iter=1000, random_state=7)
log_model_tunned.fit(X_train_scaled, y_train)
print(f'Accuracy on train set: {log_model_tunned.score(X_train_scaled, y_train)}')
print(f'Accuracy on test set: {log_model_tunned.score(X_test_scaled, y_test)}')

# Per-class precision / recall / F1 on both splits.
y_train_pred = log_model_tunned.predict(X_train_scaled)
y_test_pred = log_model_tunned.predict(X_test_scaled)
report_train = classification_report(y_train, y_train_pred)
print(report_train)
report_test = classification_report(y_test, y_test_pred)
print(report_test)

## KNN to be used

In [None]:
## KNN
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
)
from sklearn.model_selection import train_test_split  # imported here so the cell runs on its own
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Same target / feature selection and the same split seed as the logistic
# regression cell.
y_2 = codon_df_clean['Kingdom_mapped']
X_2 = codon_df_clean.drop(columns=['Kingdom', 'SpeciesName', 'Kingdom_mapped', 'DNAtype_mapped'])
X_train, X_test, y_train, y_test = train_test_split(X_2, y_2, test_size=0.3, random_state=17)

# Fit the scaler on the training split only, then apply it to both splits.
knn_scaler = StandardScaler()
knn_scaler.fit(X_train)
X_train_scaler_knn = knn_scaler.transform(X_train)
X_test_scaler_knn = knn_scaler.transform(X_test)

# Sweep k = 1..19 and record accuracy on both splits.
k_list = range(1, 20)
train_scores = []
test_scores = []

for k in k_list:
    my_knn_model = KNeighborsClassifier(n_neighbors=k, weights='distance')
    my_knn_model.fit(X_train_scaler_knn, y_train)
    train_scores.append(my_knn_model.score(X_train_scaler_knn, y_train))
    test_scores.append(my_knn_model.score(X_test_scaler_knn, y_test))

plt.figure()
plt.plot(k_list, train_scores, color='blue', label='Train Score')
plt.plot(k_list, test_scores, color='red', label='Test Score')
plt.xlabel('Value of k hyperparameter')
plt.ylabel('Accuracy Score')
plt.title('with scaling')
plt.legend()
plt.show()

# NOTE(review): k=3 appears to be chosen from the sweep above, which scores on
# the test split; if so the test metrics below are optimistic. Consider a
# validation split or cross-validation for tuning.
knn_model_tunned = KNeighborsClassifier(n_neighbors=3, weights='distance')
knn_model_tunned.fit(X_train_scaler_knn, y_train)

y_pred_test = knn_model_tunned.predict(X_test_scaler_knn)
y_pred_train = knn_model_tunned.predict(X_train_scaler_knn)

# Test accuracy. This previously referenced `y_pred`, which is never defined in
# this cell; the test predictions are held in `y_pred_test`.
accuracy = accuracy_score(y_test, y_pred_test)
print(f'Accuracy: {accuracy}')

# Per-class precision / recall / F1 on both splits.
report_train = classification_report(y_train, y_pred_train)
print(report_train)
report_test = classification_report(y_test, y_pred_test)
print(report_test)

# Train split: a row-normalised confusion matrix straight from the estimator,
# then the raw-count matrix (computed once and reused for the display).
cm_train = confusion_matrix(y_train, y_pred_train)
ConfusionMatrixDisplay.from_estimator(knn_model_tunned, X_train_scaler_knn, y_train, normalize='true')
disp = ConfusionMatrixDisplay(confusion_matrix=cm_train)
disp.plot()

# Test split: raw-count confusion matrix.
cm = confusion_matrix(y_test, y_pred_test)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()