### 1) Try different thresholds for computing predictions using the 'Al' column. The default threshold is 0.5. Use the predict_proba function to compute probabilities, then try custom thresholds and observe their impact on Accuracy, Precision and Recall.

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [3]:
data = pd.read_csv('glass.csv')

# Single-feature design matrix; reshape(-1, 1) turns the 1-D column into the
# 2-D (n_samples, 1) array that the estimator is fitted on.
X = data['Al'].values.reshape(-1, 1)
# Binary target: 1 when Type == 1, 0 for every other Type value.
y = data['Type'].apply(lambda x: 1 if x == 1 else 0)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# logistic regression
model = LogisticRegression(solver='liblinear')
model.fit(X_train, y_train)

# Predict
y_probs = model.predict_proba(X_test)[:, 1]  # Get probabilities for the positive class

# Different thresholds
thresholds = [0.4, 0.6, 0.8]
metrics = []

for threshold in thresholds:
    # A sample is labelled positive when its probability reaches the threshold.
    y_pred = (y_probs >= threshold).astype(int)
    accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0: when a threshold produces no positive predictions the
    # metric is undefined; report 0 explicitly instead of emitting a warning.
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    metrics.append((threshold, accuracy, precision, recall))

metrics_df = pd.DataFrame(metrics, columns=['Threshold', 'Accuracy', 'Precision', 'Recall'])
print(metrics_df)

   Threshold  Accuracy  Precision    Recall
0        0.4  0.720930   0.454545  0.454545
1        0.6  0.697674   0.000000  0.000000
2        0.8  0.744186   0.000000  0.000000


  _warn_prf(average, modifier, msg_start, len(result))


### 2) Do the same analysis for the other columns

In [4]:
def evaluate_feature(feature_name, data, thresholds=(0.4, 0.6, 0.8)):
    """Fit a one-feature logistic regression and report metrics per threshold.

    The target is binarised as ``Type == 1`` versus every other ``Type``
    value. The data is split 80/20 with a fixed seed, a liblinear logistic
    regression is fitted on the single column ``feature_name``, and the
    positive-class probabilities on the test split are converted to labels at
    each threshold.

    Parameters
    ----------
    feature_name : str
        Name of the column in ``data`` to use as the only predictor.
    data : pandas.DataFrame
        Frame containing ``feature_name`` and a ``Type`` column.
    thresholds : iterable of float, optional
        Probability cut-offs to evaluate. Defaults to ``(0.4, 0.6, 0.8)``.

    Returns
    -------
    pandas.DataFrame
        One row per threshold with columns ``Threshold``, ``Accuracy``,
        ``Precision`` and ``Recall``. The same table is also printed.
    """
    # Reshape the 1-D column into the 2-D (n_samples, 1) array used for fitting.
    X = data[feature_name].values.reshape(-1, 1)
    y = data['Type'].apply(lambda x: 1 if x == 1 else 0)

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Logistic regression model
    model = LogisticRegression(solver='liblinear')
    model.fit(X_train, y_train)

    # Predict probabilities for the positive class
    y_probs = model.predict_proba(X_test)[:, 1]

    metrics = []

    # Evaluate each threshold
    for threshold in thresholds:
        y_pred = (y_probs >= threshold).astype(int)
        accuracy = accuracy_score(y_test, y_pred)
        # zero_division=0: with no positive predictions the metric is
        # undefined; report 0 explicitly instead of emitting a warning.
        precision = precision_score(y_test, y_pred, zero_division=0)
        recall = recall_score(y_test, y_pred, zero_division=0)
        metrics.append((threshold, accuracy, precision, recall))

    # Tabulate and display the results
    metrics_df = pd.DataFrame(metrics, columns=['Threshold', 'Accuracy', 'Precision', 'Recall'])
    print(f"Results for feature: {feature_name}")
    print(metrics_df)
    print("\n")

    # Returned so callers can compare features programmatically.
    return metrics_df

# Reload the dataset so this cell does not depend on earlier cells.
data = pd.read_csv('glass.csv')

# Columns to analyse one at a time; extend this list to cover more features.
features = [
    'Na',
    'Mg',
    'Si',
    'K',
    'Ca',
    'Ba',
    'Fe',
]

# Run the single-feature threshold analysis for each listed column.
for feature in features:
    evaluate_feature(feature_name=feature, data=data)


Results for feature: Na
   Threshold  Accuracy  Precision  Recall
0        0.4  0.744186        0.0     0.0
1        0.6  0.744186        0.0     0.0
2        0.8  0.744186        0.0     0.0


Results for feature: Mg
   Threshold  Accuracy  Precision    Recall
0        0.4  0.744186        0.5  0.909091
1        0.6  0.744186        0.0  0.000000
2        0.8  0.744186        0.0  0.000000


Results for feature: Si
   Threshold  Accuracy  Precision  Recall
0        0.4  0.744186        0.0     0.0
1        0.6  0.744186        0.0     0.0
2        0.8  0.744186        0.0     0.0


Results for feature: K
   Threshold  Accuracy  Precision  Recall
0        0.4  0.744186        0.0     0.0
1        0.6  0.744186        0.0     0.0
2        0.8  0.744186        0.0     0.0


Results for feature: Ca
   Threshold  Accuracy  Precision  Recall
0        0.4  0.744186        0.0     0.0
1        0.6  0.744186        0.0     0.0
2        0.8  0.744186        0.0     0.0


Results for feature: Ba

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

### 3) Fit a Logistic Regression model on all features. Remember to preprocess the data (e.g., normalization and one-hot encoding).


In [6]:
data = pd.read_csv('glass.csv')

# Every column except the label is used as a predictor; the label is kept
# multi-class here (no binarisation).
X = data.drop('Type', axis=1)
y = data['Type']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling lives inside the pipeline so it is fitted on the training split only
# and then applied unchanged to the test split.
# NOTE(review): `multi_class` is reported as deprecated in recent scikit-learn
# releases; confirm against the installed version and consider wrapping the
# estimator in OneVsRestClassifier if an upgrade is planned.
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Normalize
    ('classifier', LogisticRegression(solver='liblinear', multi_class='ovr'))
])

# Train the model
pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = pipeline.predict(X_test)

# Evaluate. zero_division=0: classes that never get predicted have undefined
# precision; report 0 explicitly instead of emitting a warning.
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           1       0.56      0.82      0.67        11
           2       0.50      0.57      0.53        14
           3       0.00      0.00      0.00         3
           5       1.00      0.25      0.40         4
           6       0.00      0.00      0.00         3
           7       0.80      1.00      0.89         8

    accuracy                           0.60        43
   macro avg       0.48      0.44      0.41        43
weighted avg       0.55      0.60      0.55        43


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
