# DATA EXPLORATION

In [4]:
# Data Collection, Data Cleaning & Data Manipulation 
import numpy as np 
import pandas as pd 
from sklearn import datasets 

# Data Visualization
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(font_scale = 1)

# Data Transformation
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from scipy import stats

# Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn import set_config
set_config(display='diagram')

# Models Building 
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

# Classification Problems
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn. ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score

# Regression Problems
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDRegressor
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import warnings
warnings.filterwarnings('ignore')
from sklearn.exceptions import ConvergenceWarning

# Explainable AI (XAI)
# !pip install lime
# import lime.lime_tabular
# !pip install shap
# import shap

# Unsupervised Learning: Clustering
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, Birch, MeanShift, SpectralClustering
from sklearn.metrics import adjusted_rand_score

In [5]:
import plotly.graph_objects as go

In [6]:
# pip install -U dataprep

In [7]:
# !pip install pycaret[full]

In [8]:
# from dataprep.eda import *
# from pycaret.classification import *

In [9]:
# Read the full dataset from a CSV file; the path is relative to the current
# working directory, so the file must be present there before this cell runs.
data = pd.read_csv('./aw_fb_data.csv')

FileNotFoundError: [Errno 2] No such file or directory: '../input/apple-watch-and-fitbit-data/aw_fb_data.csv'

In [None]:
data.shape

In [None]:
data.info()

In [None]:
data.head()

In [None]:
data.describe().transpose()

In [None]:
# Draw one pie chart per object-typed column, sized by how often each value occurs.
for column_name in data.select_dtypes('object').columns:
    plt.figure()
    data[column_name].value_counts().plot.pie()

In [None]:
fig, axs = plt.subplots(1, 2, figsize=(15, 5))
sns.countplot(ax=axs[0],x='age', hue='activity', data=data)
sns.countplot(ax=axs[1],x='gender', hue='activity', data=data)

In [None]:
sns.heatmap(pd.crosstab(data["activity"],data["device"]),annot=True,fmt='d')

In [None]:
# plt.rcParams.update({'font.size': 10})

sns.set(font_scale = 1)

data.hist(bins = 20, color = 'orange', figsize = (20, 14))

In [None]:
data.activity.value_counts()

In [None]:
# Correlation is only defined for numeric columns. The frame also holds text
# columns ('activity', 'device'), and pandas >= 2.0 raises instead of silently
# dropping them inside DataFrame.corr(), so select the numeric columns explicitly.
plt.figure(figsize=(15,10))
sns.heatmap(data.select_dtypes(include='number').corr(), annot=True, center=0, linewidths=.5, fmt='.2f', vmin=-1, vmax=1, cmap='vlag')

In [None]:
data[data.activity == 'Lying'].head()

In [None]:
data[data.activity == 'Running 7 METs'].head()

In [None]:
data.device.value_counts()

# TRAIN-TEST SPLIT

In [None]:
# df_aw = data[data['device']=='apple watch']

# df_aw=df_aw.drop('device', axis=1)

In [None]:
np.random.seed(42)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state makes the
# same rows land in each split on every run.
train_set, test_set = train_test_split(data, test_size = 0.2, random_state = 42)

# DATA CLEANING

In [None]:
train_set.columns

In [None]:
# df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')

In [None]:
# df.columns

In [None]:
train_set.isnull().sum()

In [None]:
train_set.duplicated().sum()

# DATA ANALYSIS

In [None]:
train_set.columns

In [None]:
categorical_df = train_set.select_dtypes(include = 'object')

categorical_df.info()

In [None]:
# Report the number of distinct values in each categorical column.
for column_name, distinct_count in categorical_df.nunique().items():
    print(f'{column_name}: {distinct_count}')
    print('\n')

In [None]:
# Donut chart of how many training rows fall into each activity class.
# value_counts() orders its result by descending frequency, so the labels must be
# read from its index; a hand-written label list would be matched to the wrong counts.
values = train_set['activity'].value_counts()
labels = values.index.tolist()
colors = ['red', 'royalblue','green','yellow','pink','grey']
fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.5)])
fig.update_traces(hoverinfo='label+value',textfont_size=15,marker=dict(colors=colors))
# The centre caption is derived from the data so it cannot disagree with the chart.
fig.update_layout(annotations=[dict(text=f'{len(labels)} types of Activity',
                                    x=0.50, y=0.5, font_size=15,
                                    showarrow=False)])
fig.show()

In [None]:
num_df = train_set.select_dtypes(include = 'number')

In [None]:
# Histogram (with KDE overlay) of every numeric training column, laid out on a grid.

# The 'seaborn' style sheet was renamed to 'seaborn-v0_8' in matplotlib 3.6 and the
# old name was later removed; use whichever name this matplotlib build provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

names = list(num_df.columns)

plot_per_row = 2

# Ceiling division: round() uses banker's rounding (round(8.5) == 8), which would
# allocate one row too few for some odd column counts and raise IndexError below.
n_rows = -(-len(names) // plot_per_row)

# squeeze=False keeps `axes` two-dimensional even when only one row is needed.
f, axes = plt.subplots(n_rows, plot_per_row, figsize = (15, 25), squeeze=False)

for y, name in enumerate(names):
    i, j = divmod(y, plot_per_row)
    sns.histplot(x=num_df[name], kde = True, ax=axes[i, j], color = 'purple')

# Hide any trailing grid cell that has no column to display.
for unused_ax in axes.flat[len(names):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Box plot of every numeric training column, laid out on a grid.

# The 'seaborn' style sheet was renamed to 'seaborn-v0_8' in matplotlib 3.6 and the
# old name was later removed; use whichever name this matplotlib build provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

names = list(num_df.columns)

plot_per_row = 2

# Ceiling division: round() uses banker's rounding (round(8.5) == 8), which would
# allocate one row too few for some odd column counts and raise IndexError below.
n_rows = -(-len(names) // plot_per_row)

# squeeze=False keeps `axes` two-dimensional even when only one row is needed.
f, axes = plt.subplots(n_rows, plot_per_row, figsize = (15, 25), squeeze=False)

for y, name in enumerate(names):
    i, j = divmod(y, plot_per_row)
    sns.boxplot(x=num_df[name], ax=axes[i, j], palette = 'Set3')

# Hide any trailing grid cell that has no column to display.
for unused_ax in axes.flat[len(names):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between numeric training columns.
corr_matrix = num_df.corr()

plt.figure(figsize=(20, 14))
g = sns.heatmap(corr_matrix, annot=True, cmap='magma')

# Tilt the x tick labels so long column names stay readable.
g.set_xticklabels(
    g.get_xticklabels(),
    rotation=25,
    horizontalalignment='right',
)

plt.title('Correlation between numerical features')
plt.show()

In [None]:
# Independent copy of the rows whose device is 'apple watch', re-indexed from zero.
df_a = data.loc[data['device'] == 'apple watch'].copy().reset_index(drop=True)

In [None]:
plt.figure(figsize=(15,10))
sns.regplot(data = df_a, x='steps', y='distance')
plt.title('Distance Vs. Steps')
plt.tight_layout()

# FEATURES-TARGET SPLIT

In [None]:
X_train = train_set.copy()
y_train = X_train.pop("activity")

In [None]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

y_train = label_encoder.fit_transform(y_train)

In [None]:
X_test = test_set.copy()
y_test = X_test.pop("activity")

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the test labels with the encoder that was fitted on the training labels.
# Fitting a second LabelEncoder on the test split would build its own mapping; if
# the set of activities in the two splits ever differed, the same activity would
# receive different integers in train and test and every test metric would be wrong.
y_test = label_encoder.transform(y_test)

# DATA PIPELINE

In [None]:
# Fallback for scikit-learn releases whose SimpleImputer has no
# get_feature_names_out (needed to recover column names from the pipelines).
# Install it only when the method is missing, so the library's own implementation
# in newer releases is not overwritten.
if not hasattr(SimpleImputer, "get_feature_names_out"):
    SimpleImputer.get_feature_names_out = (lambda self, names=None:
                                           self.feature_names_in_)

# NOTE(review): 'Unnamed: 0' and 'X1' look like row-index columns rather than
# measurements — confirm they should be fed to the models as features.
num_attribs = ['Unnamed: 0', 'X1', 'age', 'gender', 'height', 'weight', 'steps',
       'hear_rate', 'calories', 'distance', 'entropy_heart', 'entropy_setps',
       'resting_heart', 'corr_heart_steps', 'norm_heart', 'intensity_karvonen',
       'sd_norm_heart', 'steps_times_distance']

cat_attribs = ['device']

# Numeric columns: fill missing values with the median, then scale robustly.
num_pipeline = make_pipeline(SimpleImputer(strategy="median"),
                             RobustScaler())

# Categorical columns: fill missing values with the mode, then one-hot encode;
# categories unseen during fit are ignored at transform time.
cat_pipeline = make_pipeline(SimpleImputer(strategy="most_frequent"),
                             OneHotEncoder(handle_unknown="ignore"))

# Note: this rebinds the name `preprocessing`, shadowing the sklearn.preprocessing
# module imported at the top of the notebook. Later cells refer to this
# transformer by that name, so it is kept.
preprocessing = make_column_transformer(
    (num_pipeline, num_attribs),
    (cat_pipeline, cat_attribs))

In [None]:
preprocessing

In [None]:
def _report_split(title, y_true, y_pred, axis):
    """Print the metrics for one data split and draw its confusion matrix on ``axis``."""
    print(title)

    print(f'Classification Report:\n{classification_report(y_true, y_pred, digits = 4)}\n')

    # Compute the matrix once and reuse it for both the text and the heatmap.
    matrix = confusion_matrix(y_true, y_pred)
    print(f'Confusion Matrix:\n{matrix}\n')

    # Draw only; printing the call's return value would just emit the Axes repr.
    sns.heatmap(matrix, annot=True, fmt="g", ax=axis)

    print('\n\n')


def print_score(classifier, X_train, y_train, X_test, y_test):
    """Print classification metrics and plot confusion matrices for train and test.

    For the training split and then the test split, predicts with ``classifier``,
    prints the classification report and the confusion matrix, and draws the
    confusion matrix as a heatmap (train in the left panel, test in the right).

    Parameters
    ----------
    classifier : fitted estimator exposing ``predict``.
    X_train, y_train : features and true labels of the training split.
    X_test, y_test : features and true labels of the test split.

    Returns
    -------
    None
    """
    print('\n\n')

    fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15,6))
    ax[0].set_title("train")
    ax[1].set_title("test")

    _report_split("TRAINING RESULTS:\n", y_train, classifier.predict(X_train), ax[0])
    _report_split("TEST RESULTS:\n", y_test, classifier.predict(X_test), ax[1])

# LOGISTIC REGRESSION

In [None]:
from sklearn.linear_model import LogisticRegression
import scipy.stats as sp
import statsmodels.api as sm
import statsmodels.formula.api as smf
# from sklearn import preprocessing 
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [None]:
# Baseline for this section: logistic regression with default parameters.
# LinearRegression is a regressor; fitting it to the integer-encoded activity
# labels would treat the class ids as a continuous, ordered quantity.
classifier = make_pipeline(preprocessing, LogisticRegression())

classifier.fit(X_train,y_train)

In [None]:
classifier = Pipeline([
    ("preprocessing", preprocessing),
    ("logistic_regression", LogisticRegression()),
])

classifier.fit(X_train,y_train)

In [None]:
classifier.named_steps["logistic_regression"].get_params()

In [None]:
print_score(classifier, X_train, y_train, X_test, y_test)

In [None]:
# full_pipeline = Pipeline([
#     ("preprocessing", preprocessing),
#     ("logistic_regression", LogisticRegression()),
# ])

# logistic_param = [
#     {
#         'logistic_regression__solver' : ['newton-cg', 'lbfgs', 'liblinear'],
#         'logistic_regression__penalty': ['l1','l2'],
#         'logistic_regression__C' : [0.001, 0.01, 0.1, 1, 10, 100, 1000],
#         'logistic_regression__max_iter' : [1000],
#     }
# ]

# grid_search = GridSearchCV(full_pipeline, logistic_param, cv = 5, scoring='accuracy', n_jobs = -1)

# grid_search.fit(X_train, y_train)

In [None]:
# print("Tuned Hyperparameters :\n", grid_search.best_params_)
# print('\n')
# print("Accuracy:\n",grid_search.best_score_)

In [None]:
# cv_res = pd.DataFrame(grid_search.cv_results_)

# cv_res.sort_values(by="mean_test_score", ascending = False, inplace=True)

# cv_res.head()

In [None]:
# C = grid_search.best_params_['logistic_regression__C']
# penalty = grid_search.best_params_['logistic_regression__penalty']
# solver = grid_search.best_params_['logistic_regression__solver']
# max_iter = grid_search.best_params_['logistic_regression__max_iter']

# classifier = Pipeline([
#     ("preprocessing", preprocessing),
#     ("logistic_regression", LogisticRegression(C=C, penalty = penalty, solver = solver, max_iter = max_iter)),
# ])

# classifier.fit(X_train,y_train)

In [None]:
# Hyperparameters fixed by hand. Presumably these are the best values found by the
# grid search that is commented out in the preceding cells — TODO confirm.
C = 100
penalty = 'l2'
solver = 'newton-cg'
max_iter = 1000

# Same two-step pipeline as before, with the fixed hyperparameters applied to the model.
classifier = Pipeline([
    ("preprocessing", preprocessing),
    ("logistic_regression", LogisticRegression(C=C, penalty = penalty, solver = solver, max_iter = max_iter)),
])

classifier.fit(X_train,y_train)

In [None]:
classifier.named_steps["logistic_regression"].get_params()

In [None]:
# Output column names of the preprocessing steps (the pipeline minus its final model).
feature_names = classifier[:-1].get_feature_names_out()

# For a multiclass model coef_ holds one row of coefficients per class, so row 0
# alone ranks features only by their effect on the first class. Average the
# absolute coefficients over all classes to get one overall score per feature.
importances = pd.DataFrame(data={
    'Attribute': feature_names,
    'Importance': np.abs(classifier.named_steps["logistic_regression"].coef_).mean(axis=0)
})

In [None]:
importances

In [None]:
importances = importances.sort_values(by='Importance', ascending=False)

In [None]:
plt.figure(figsize = (14, 7))

plt.bar(x=importances['Attribute'], height = importances['Importance'], color='#087E8B')
plt.title('Feature importances obtained from coefficients', size=20)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Disabled: create_model / plot_model are not defined anywhere in this notebook
# (presumably they come from `pycaret.classification`, whose import is commented
# out near the top), so running this cell on a fresh kernel raises NameError.
# Re-enable it only together with that import and the PyCaret experiment setup.
# lgbm = create_model("lightgbm")
# plot_model(estimator = lgbm , plot= "learning")

# RANDOM FOREST

In [None]:
classifier = Pipeline([
    ("preprocessing", preprocessing),
    ("random_forest", RandomForestClassifier(random_state = 42)),
])

classifier.fit(X_train,y_train)

In [None]:
classifier.named_steps["random_forest"].get_params()

In [None]:
print_score(classifier, X_train, y_train, X_test, y_test)

In [None]:
# Take the column names from this pipeline's own preprocessing steps instead of
# relying on a `feature_names` variable left over from the logistic-regression
# cells; the cell then also works when the sections are run independently.
feature_names = classifier[:-1].get_feature_names_out()

# One row per transformed column, most important first.
features_importance = pd.DataFrame(
    {
        'Column': feature_names,
        'Feature importance': classifier.named_steps["random_forest"].feature_importances_
    }
).sort_values('Feature importance', ascending = False)

sns.set(font_scale = 2)
fig, ax = plt.subplots(figsize = (7, 10))
ax = sns.barplot(x = "Feature importance", y = "Column", data = features_importance, palette = "Set2", orient = 'h');

# XGBOOST

In [None]:
classifier = Pipeline([
    ("preprocessing", preprocessing),
    ("xgboost", XGBClassifier()),
])

classifier.fit(X_train,y_train)

In [None]:
classifier.named_steps["xgboost"].get_params()

In [None]:
print_score(classifier, X_train, y_train, X_test, y_test)

In [None]:
# Take the column names from this pipeline's own preprocessing steps instead of
# relying on a `feature_names` variable left over from the logistic-regression
# cells; the cell then also works when the sections are run independently.
feature_names = classifier[:-1].get_feature_names_out()

# One row per transformed column, most important first.
features_importance = pd.DataFrame(
    {
        'Column': feature_names,
        'Feature importance': classifier.named_steps["xgboost"].feature_importances_
    }
).sort_values('Feature importance', ascending = False)

sns.set(font_scale = 2)
fig, ax = plt.subplots(figsize = (7, 10))
ax = sns.barplot(x = "Feature importance", y = "Column", data = features_importance, palette = "Set2", orient = 'h');