# Import libraries

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import seaborn as sns
import scipy
from scipy.stats import pearsonr

import sklearn
from sklearn import datasets, linear_model
from sklearn import preprocessing
from sklearn.model_selection import train_test_split


# Load Datasets

In [2]:
# Load the dataset into a DataFrame.
# Change the path below to wherever Data_for_UCI_named.csv is stored on your machine.
# Windows example (raw string so the backslash is not treated as an escape):
# all_df=pd.read_csv(r"C:\Data_for_UCI_named.csv",index_col=False)
# index_col=False tells pandas not to use the first column as the row index.
all_df=pd.read_csv("/Users/alden/Desktop/machine_learning/Data_for_UCI_named.csv",index_col=False)
# Preview the first rows to confirm the file was parsed as expected.
all_df.head()

FileNotFoundError: [Errno 2] No such file or directory: '/Users/alden/Desktop/machine_learning/Data_for_UCI_named.csv'

# Dataset Information

In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame.
all_df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
all_df.describe()

In [None]:
# Count how many samples belong to each class (stable / unstable) in 'stabf'.
all_df['stabf'].value_counts()

In [None]:
# Bar chart of the number of samples per 'stabf' class.
sns.countplot(x="stabf", data=all_df)

In [None]:
# Check each column for outliers: one box plot per column, each on its own
# independently scaled subplot.
data_mean = all_df.iloc[:, :]
data_mean.plot.box(
    subplots=True,
    layout=(8, 4),
    sharex=False,
    sharey=False,
    fontsize=12,
    figsize=(15, 20),
);

In [None]:
# Draw the first 13 columns as box plots on one shared axis so their
# scales can be compared side by side.
fig, ax = plt.subplots(nrows=1, figsize=(20, 8))
sns.boxplot(ax=ax, data=all_df.iloc[:, :13])

In [None]:
# Box plot of every feature column, split by the 'stabf' class.
# The last two columns are skipped (presumably the targets 'stab' and 'stabf';
# verify against the dataset's column order).
feature_cols = all_df.columns[:-2]
n_cols = 4
# Size the grid from the number of features (ceiling division) instead of a
# fixed 4x4, so there is always room for every feature.
n_rows = max(1, -(-len(feature_cols) // n_cols))
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols,
                         figsize=(15, 5 * n_rows), squeeze=False)
fig.subplots_adjust(hspace=.2, wspace=.5)
axes = axes.ravel()
for i, col in enumerate(feature_cols):
    _ = sns.boxplot(y=col, x='stabf', data=all_df, ax=axes[i])
# Hide any axes left over in a partially filled last row.
for spare_ax in axes[len(feature_cols):]:
    spare_ax.set_visible(False)

# HeatMap

In [None]:
# Pairwise correlation of the numeric columns only. The class label column
# 'stabf' is still non-numeric at this point, and DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0 instead of silently dropping them.
corrMatt = all_df.select_dtypes(include='number').corr()
# Boolean mask that hides the upper triangle (diagonal included), so each
# pair of columns is shown only once.
mask = np.triu(np.ones(corrMatt.shape, dtype=bool))
# Set up the matplotlib figure
fig, ax = plt.subplots(figsize=(20, 12))
plt.title('electric grid Feature Correlation')
# Generate a custom diverging colormap
cmap = sns.diverging_palette(260, 10, as_cmap=True)
# Draw the annotated heatmap; the trailing semicolon suppresses the
# notebook's text output for the returned Axes.
sns.heatmap(corrMatt, vmax=1.2, square=False, cmap=cmap, mask=mask,
            ax=ax, annot=True, fmt='.2g', linewidths=1);

# Data Cleaning and Preprocessing

In [None]:
# Check for missing values. Printing the full boolean frame is truncated for
# large data and cannot show whether any nulls exist, so report the number of
# missing entries per column instead.
isnull = all_df.isnull()
print(isnull.sum())

In [None]:
# Remove rows with any missing value, then exact duplicate rows.
# Both calls modify all_df in place.
all_df.dropna(inplace = True)
all_df.drop_duplicates(inplace = True)
# (rows, columns) remaining after cleaning.
all_df.shape

In [None]:
def outliers(df, ft, factor=1.5):
    """Return the index labels of rows whose ``ft`` value is an IQR outlier.

    A value is an outlier when it lies strictly below
    ``Q1 - factor * IQR`` or strictly above ``Q3 + factor * IQR``, where
    Q1/Q3 are the 25th/75th percentiles of the column and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to inspect.
    ft : str
        Name of the (numeric) column to test.
    factor : float, default 1.5
        Multiplier applied to the IQR to set the fence width. The default
        reproduces the conventional 1.5 * IQR rule.

    Returns
    -------
    pandas.Index
        Index labels of the outlying rows (empty if there are none).
        Missing values are never flagged, because comparisons with NaN
        evaluate to False.
    """
    column = df[ft]
    q1, q3 = column.quantile(0.25), column.quantile(0.75)
    fence = factor * (q3 - q1)
    lower_bound = q1 - fence
    upper_bound = q3 + fence

    is_outlier = (column < lower_bound) | (column > upper_bound)
    return df.index[is_outlier]

In [None]:
def remove(df, ls):
    """Return a new frame without the rows whose index labels are in ``ls``.

    ``ls`` may contain repeated labels; they are de-duplicated and sorted
    before being dropped. The input frame itself is not modified.
    """
    return df.drop(index=sorted(set(ls)))

In [None]:
# Gather the index labels of the outlying rows of every input feature.
# A row that is an outlier in several features appears several times here;
# remove() de-duplicates the labels before dropping.
feature_names = ('tau1', 'tau2', 'tau3', 'tau4',
                 'p1', 'p2', 'p3', 'p4',
                 'g1', 'g2', 'g3', 'g4')
index_list = []
for feature in feature_names:
    index_list += list(outliers(all_df, feature))

In [None]:
# Build the cleaned frame by dropping every row collected in index_list.
all_df_cleaned = remove(all_df,index_list)

In [None]:
# (rows, columns) left after outlier removal.
all_df_cleaned.shape

In [None]:
# Feature matrix: every column except the two target columns.
X = all_df_cleaned.drop(columns=['stabf', 'stab'])

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardize every feature column of X.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# further down, so the held-out rows contribute to the scaling statistics.
# Consider fitting on the training split only.
scaler = StandardScaler()
Xs = scaler.fit_transform(X)
# Box plot of the scaled features on one shared axis.
fig,ax=plt.subplots(1,figsize=(20,8))
sns.boxplot(data=Xs,ax=ax)

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace the class labels in 'stabf' with integer codes. LabelEncoder
# numbers the classes in sorted order; inspect le.classes_ for the mapping.
le = LabelEncoder()
all_df_cleaned['stabf'] = le.fit_transform(all_df_cleaned['stabf'])
# (A stray mid-cell `all_df.head()` was removed here: its result was
# discarded, and it referred to the un-encoded frame.)
# Use the encoded labels as the classification target.
y = all_df_cleaned['stabf']
y

# Split dataset into training and test sets

In [None]:
# Hold out 30% of the samples for testing. stratify=y keeps the class
# proportions the same in both splits; random_state=1 makes the split repeatable.
Xs_train, Xs_test, y_train, y_test = train_test_split(Xs, y, test_size=0.3, 
random_state=1, stratify=y)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Train a logistic-regression classifier on the training split.
# fit() returns the fitted estimator itself, which is what clf is bound to.
logreg = LogisticRegression(random_state=1)
clf = logreg.fit(Xs_train, y_train)

## Classification accuracy without k-fold

In [None]:
# Accuracy of the fitted classifier on the held-out test split.
classifier_score = clf.score(Xs_test, y_test)
print(f'The classifier accuracy score is {classifier_score:03.2f}')

## K-fold Cross Validation For Classification Accuracy

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
n_folds = 5
# Cross-validate on the raw features with scaling inside the pipeline, so the
# scaler is re-fitted on the training part of every fold. Scoring the
# pre-scaled Xs would let each validation fold influence the scaling
# statistics (data leakage). cross_val_score clones the estimator for each
# fold, so the already fitted clf is left untouched.
cv_model = make_pipeline(StandardScaler(), clf)
cv_scores = cross_val_score(cv_model, X, y, cv=n_folds, scoring='accuracy')
# NOTE: despite its name, cv_error holds the mean accuracy (higher is better).
cv_error = np.average(cv_scores)
print('The {}-fold cross-validation accuracy score for this classifier is {:.2f}'.format(n_folds, cv_error))