In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# Load the dataset
dataset_path = r"F:\4th year 2nd semester\Thesis\Dataset\nutritional_info_first_1000_rows.csv"
df = pd.read_csv(dataset_path)

# Check for non-numeric values in input columns excluding 'remarks'
non_numeric_cols = df.drop('remarks', axis=1).select_dtypes(exclude=['number']).columns
if not non_numeric_cols.empty:
    # If 'id' and 'title' are not relevant, drop them
    df = df.drop([ 'title'], axis=1)
    print(f"Non-numeric columns dropped: {non_numeric_cols}")

# Convert string labels to numerical labels using LabelEncoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['remarks'] = le.fit_transform(df['remarks'])

# Split the data into input features (X) and output (y)
X = df.drop('remarks', axis=2)
y = df['remarks']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize the KNN model
model = KNeighborsClassifier()

# Hyperparameter tuning using GridSearchCV
param_grid = {'n_neighbors': [3, 5, 7, 9]}
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_scaled, y_train)

# Get the best parameters from the grid search
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Train the model on the training set with the best parameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = best_model.predict(X_test_scaled)

# Convert predictions back to original labels for evaluation (if needed)
y_pred_original = le.inverse_transform(y_pred)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on Test Set: {accuracy:.2f}")

# Print classification report for more detailed evaluation
print("Classification Report on Test Set:\n", classification_report(y_test, y_pred_original))

# Obtain confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)


Non-numeric columns dropped: Index(['title'], dtype='object')


ValueError: No axis named 2 for object type DataFrame

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# Load the dataset
dataset_path = r"F:\4th year 2nd semester\Thesis\Dataset\nutritional_info_first_1000_rows.csv"
df = pd.read_csv(dataset_path)

# Check for non-numeric values in input columns excluding 'remarks'
non_numeric_cols = df.drop('remarks', axis=1).select_dtypes(exclude=['number']).columns
if not non_numeric_cols.empty:
    # If 'id' and 'title' are not relevant, drop them
    df = df.drop(['title'], axis=1)
    print(f"Non-numeric columns dropped: {non_numeric_cols}")

# Convert string labels to numerical labels using LabelEncoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['remarks'] = le.fit_transform(df['remarks'])

# Split the data into input features (X) and output (y)
X = df.drop('remarks', axis=1)
y = df['remarks']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize the KNN model
model = KNeighborsClassifier()

# Hyperparameter tuning using GridSearchCV
param_grid = {'n_neighbors': [3, 5, 7, 9]}
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_scaled, y_train)

# Get the best parameters from the grid search
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Train the model on the training set with the best parameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = best_model.predict(X_test_scaled)

# Convert predictions back to original labels for evaluation (if needed)
y_pred_original = le.inverse_transform(y_pred)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on Test Set: {accuracy:.2f}")

# Print classification report for more detailed evaluation
print("Classification Report on Test Set:\n", classification_report(y_test, y_pred_original))

# Obtain confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)


Non-numeric columns dropped: Index(['title'], dtype='object')
Best Parameters: {'n_neighbors': 5}
Accuracy on Test Set: 0.92
Classification Report on Test Set:
               precision    recall  f1-score   support

           0       0.92      0.88      0.90        86
           1       0.91      0.94      0.93       114

    accuracy                           0.92       200
   macro avg       0.92      0.91      0.91       200
weighted avg       0.92      0.92      0.91       200

Confusion Matrix:
 [[ 76  10]
 [  7 107]]


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# Load the dataset
dataset_path = r"F:\4th year 2nd semester\Thesis\Dataset\nutritional_info_first_1000_rows.csv"
df = pd.read_csv(dataset_path)

# Check for non-numeric values in input columns excluding 'remarks'
non_numeric_cols = df.drop('remarks', axis=1).select_dtypes(exclude=['number']).columns
if not non_numeric_cols.empty:
    # If 'id' and 'title' are not relevant, drop them
    df = df.drop(['title'], axis=1)
    print(f"Non-numeric columns dropped: {non_numeric_cols}")

# Convert string labels to numerical labels using LabelEncoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['remarks'] = le.fit_transform(df['remarks'])

# Split the data into input features (X) and output (y)
X = df.drop('remarks', axis=1)
y = df['remarks']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize the KNN model
model = KNeighborsClassifier()

# Cross-validation
cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')  # 5-fold cross-validation
print("Cross-Validation Scores:", cv_scores)
print("Mean Accuracy:", cv_scores.mean())

# Hyperparameter tuning using GridSearchCV
param_grid = {'n_neighbors': [3, 5, 7, 9]}
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_scaled, y_train)

# Get the best parameters from the grid search
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Train the model on the training set with the best parameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = best_model.predict(X_test_scaled)

# Convert predictions back to original labels for evaluation (if needed)
y_pred_original = le.inverse_transform(y_pred)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on Test Set: {accuracy:.2f}")

# Print classification report for more detailed evaluation
print("Classification Report on Test Set:\n", classification_report(y_test, y_pred_original))

# Obtain confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)


Non-numeric columns dropped: Index(['title'], dtype='object')


NameError: name 'cross_val_score' is not defined

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# Load the dataset
dataset_path = r"F:\4th year 2nd semester\Thesis\Dataset\nutritional_info_first_1000_rows.csv"
df = pd.read_csv(dataset_path)

# Check for non-numeric values in input columns excluding 'remarks'
non_numeric_cols = df.drop('remarks', axis=1).select_dtypes(exclude=['number']).columns
if not non_numeric_cols.empty:
    # If 'id' and 'title' are not relevant, drop them
    df = df.drop(['title'], axis=1)
    print(f"Non-numeric columns dropped: {non_numeric_cols}")

# Convert string labels to numerical labels using LabelEncoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['remarks'] = le.fit_transform(df['remarks'])

# Split the data into input features (X) and output (y)
X = df.drop('remarks', axis=1)
y = df['remarks']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize the KNN model
model = KNeighborsClassifier()

# Cross-validation
cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')  # 5-fold cross-validation
print("Cross-Validation Scores:", cv_scores)
print("Mean Accuracy:", cv_scores.mean())

# Hyperparameter tuning using GridSearchCV
param_grid = {'n_neighbors': [3, 5, 7, 9]}
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_scaled, y_train)

# Get the best parameters from the grid search
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Train the model on the training set with the best parameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = best_model.predict(X_test_scaled)

# Convert predictions back to original labels for evaluation (if needed)
y_pred_original = le.inverse_transform(y_pred)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on Test Set: {accuracy:.2f}")

# Print classification report for more detailed evaluation
print("Classification Report on Test Set:\n", classification_report(y_test, y_pred_original))

# Obtain confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)


Non-numeric columns dropped: Index(['title'], dtype='object')
Cross-Validation Scores: [0.835 0.87  0.865 0.89  0.85 ]
Mean Accuracy: 0.8620000000000001
Best Parameters: {'n_neighbors': 5}
Accuracy on Test Set: 0.92
Classification Report on Test Set:
               precision    recall  f1-score   support

           0       0.92      0.88      0.90        86
           1       0.91      0.94      0.93       114

    accuracy                           0.92       200
   macro avg       0.92      0.91      0.91       200
weighted avg       0.92      0.92      0.91       200

Confusion Matrix:
 [[ 76  10]
 [  7 107]]


In [8]:
# Calculate specificity, sensitivity, FDR, and FOR
tn, fp, fn, tp = conf_matrix.ravel()

specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)
fdr = fp / (fp + tp)
for_ = fn / (fn + tn)

print(f"Specificity: {specificity:.2f}")
print(f"Sensitivity: {sensitivity:.2f}")
print(f"False Discovery Rate (FDR): {fdr:.2f}")
print(f"False Omission Rate (FOR): {for_:.2f}")

Specificity: 0.88
Sensitivity: 0.94
False Discovery Rate (FDR): 0.09
False Omission Rate (FOR): 0.08


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# Load the dataset
dataset_path = r"F:\4th year 2nd semester\Thesis\Dataset\nutritional_info_first_1000_rows.csv"
df = pd.read_csv(dataset_path)

# Check for non-numeric values in input columns excluding 'remarks'
non_numeric_cols = df.drop('remarks', axis=1).select_dtypes(exclude=['number']).columns
if not non_numeric_cols.empty:
    # If 'id' and 'title' are not relevant, drop them
    df = df.drop(['title'], axis=1)
    print(f"Non-numeric columns dropped: {non_numeric_cols}")

# Convert string labels to numerical labels using LabelEncoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['remarks'] = le.fit_transform(df['remarks'])

# Split the data into input features (X) and output (y)
X = df.drop('remarks', axis=1)
y = df['remarks']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize the KNN model
model = KNeighborsClassifier()

# Cross-validation
cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')  # 5-fold cross-validation
print("Cross-Validation Scores:", cv_scores)
print("Mean Accuracy:", cv_scores.mean())

# Hyperparameter tuning using GridSearchCV
param_grid = {'n_neighbors': [3, 5, 7, 9]}
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_scaled, y_train)

# Get the best parameters from the grid search
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Train the model on the training set with the best parameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = best_model.predict(X_test_scaled)

# Convert predictions back to original labels for evaluation (if needed)
y_pred_original = le.inverse_transform(y_pred)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on Test Set: {accuracy:.2f}")

# Print classification report for more detailed evaluation
print("Classification Report on Test Set:\n", classification_report(y_test, y_pred_original))

# Obtain confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# Calculate specificity, sensitivity, FDR, and FOR
tn, fp, fn, tp = conf_matrix.ravel()

specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)
fdr = fp / (fp + tp)
for_ = fn / (fn + tn)

print(f"Specificity: {specificity:.2f}")
print(f"Sensitivity: {sensitivity:.2f}")
print(f"False Discovery Rate (FDR): {fdr:.2f}")
print(f"False Omission Rate (FOR): {for_:.2f}")


Non-numeric columns dropped: Index(['title'], dtype='object')
Cross-Validation Scores: [0.835 0.87  0.865 0.89  0.85 ]
Mean Accuracy: 0.8620000000000001
Best Parameters: {'n_neighbors': 5}
Accuracy on Test Set: 0.92
Classification Report on Test Set:
               precision    recall  f1-score   support

           0       0.92      0.88      0.90        86
           1       0.91      0.94      0.93       114

    accuracy                           0.92       200
   macro avg       0.92      0.91      0.91       200
weighted avg       0.92      0.92      0.91       200

Confusion Matrix:
 [[ 76  10]
 [  7 107]]
Specificity: 0.88
Sensitivity: 0.94
False Discovery Rate (FDR): 0.09
False Omission Rate (FOR): 0.08
