# **Logistic Regression**

- Logistic Regression is a Classification Algorithm  
- It is used to predict the binary outcome (1/0, Yes/No, True/False) given a set of independent  variables.
- To represent binary/categorical outcome, we use dummy variables. 

In [10]:
# Import libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from skimpy import skim
from sklearn.linear_model import LogisticRegression

# import train-test split 
from sklearn.model_selection import train_test_split

In [7]:
# Load the diamonds dataset 
df = sns.load_dataset('diamonds')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    53940 non-null  float64 
 1   cut      53940 non-null  category
 2   color    53940 non-null  category
 3   clarity  53940 non-null  category
 4   depth    53940 non-null  float64 
 5   table    53940 non-null  float64 
 6   price    53940 non-null  int64   
 7   x        53940 non-null  float64 
 8   y        53940 non-null  float64 
 9   z        53940 non-null  float64 
dtypes: category(3), float64(6), int64(1)
memory usage: 3.0 MB


In [11]:
# apply the skimpy on df
skim(df)

┌────────────────────────────── skimpy summary ───────────────────────────────┐
│ [3m         Data Summary         [0m [3m      Data Types       [0m                      │
│ ┌───────────────────┬────────┐ ┌─────────────┬───────┐                      │
│ │[1;36m [0m[1;36mdataframe        [0m[1;36m [0m│[1;36m [0m[1;36mValues[0m[1;36m [0m│ │[1;36m [0m[1;36mColumn Type[0m[1;36m [0m│[1;36m [0m[1;36mCount[0m[1;36m [0m│                      │
│ ├───────────────────┼────────┤ ├─────────────┼───────┤                      │
│ │ Number of rows    │ 53940  │ │ float64     │ 6     │                      │
│ │ Number of columns │ 10     │ │ category    │ 3     │                      │
│ └───────────────────┴────────┘ │ int32       │ 1     │                      │
│                                └─────────────┴───────┘                      │
│ [3m       Categories        [0m                                                   │
│ ┌───────────────────────┐                 

In [12]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


## Logisitic Regression on 'cut'

In [18]:
# Splitting the data into featurs(X) and labels(y)
X = df.drop('cut', axis=1)
y = df['cut']

# label encode the features
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# X['cut']=le.fit_transform(X['cut'])
X['color'] = le.fit_transform(X['color'])
X['clarity'] = le.fit_transform(X['clarity'])

# Split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the data
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fitting the model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predicting the Test set results
y_pred = model.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Evaluating the model
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[[ 169   25   37   79   25]
 [  38  128  255  268  315]
 [   0    3 3752  283  254]
 [   2    3  302 2023  445]
 [   4   53  873  422 1030]]
0.6583240637745643
              precision    recall  f1-score   support

        Fair       0.79      0.50      0.62       335
        Good       0.60      0.13      0.21      1004
       Ideal       0.72      0.87      0.79      4292
     Premium       0.66      0.73      0.69      2775
   Very Good       0.50      0.43      0.46      2382

    accuracy                           0.66     10788
   macro avg       0.65      0.53      0.55     10788
weighted avg       0.65      0.66      0.63     10788



## Tasks:
1. Convert Price into binary data 
2. train test split the data 
3. train the model 
4. check the model with 5 important metrics of classification

## Logisitic Regression on 'price'

In [29]:
# convert price into binary data 

# Define the threshold value
threshold = df['price'].median()

# Convert price to binary
df['binary_price'] = df['price'].apply(lambda x:1 if x> threshold else 0) 

In [30]:
df.binary_price.unique()

array([0, 1], dtype=int64)

In [31]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,binary_price
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,0
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,0
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,0
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63,0
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,0


In [32]:
# Splitting the data into featurs(X) and labels(y)
X = df.drop(['price', 'binary_price'], axis=1)
y = df['binary_price']

# label encode the features
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X['cut']=le.fit_transform(X['cut'])
X['color'] = le.fit_transform(X['color'])
X['clarity'] = le.fit_transform(X['clarity'])

# Split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the data
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fitting the model
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

# Predicting the Test set results
y_pred = model.predict(X_test)

# Evaluating the model with classification metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
print('Accuracy Score:', accuracy_score(y_test, y_pred))
print('Precision Score:', precision_score(y_test, y_pred))
print('Recall Score:', recall_score(y_test, y_pred))
print('F1 Score:', f1_score(y_test, y_pred))
print('ROC AUC Score:', roc_auc_score(y_test, y_pred))
print('Confusion Matrix:', confusion_matrix(y_test, y_pred))



Accuracy Score: 0.9630144605116796
Precision Score: 0.9651054301175592
Recall Score: 0.9606240713224369
F1 Score: 0.9628595364423345
ROC AUC Score: 0.9630100371416034
Confusion Matrix: [[5217  187]
 [ 212 5172]]
