In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import VarianceThreshold

import warnings
# Suppresses every warning for the rest of the session. This also hides
# library warnings raised while fitting models further down, so remove this
# line when debugging unexpected results.
warnings.filterwarnings ('ignore')

import joblib

In [2]:
# Load the raw dataset from a CSV file located in the working directory.
data= pd.read_csv('Human Activity Reco. with smartphone data.csv')

In [3]:
# Display the loaded frame for a first visual check.
data

In [4]:
# (rows, columns) of the raw frame.
data.shape

In [5]:
# Summary statistics of the columns.
data.describe()

In [6]:
# Print the frame's dtype / non-null / memory summary.
data.info()

In [7]:
# Full list of column names (the default frame display abbreviates wide frames).
list(data.columns)

In [8]:
# Number of missing values per column.
data.isna().sum()

In [9]:
# Class balance: number of rows per activity, smallest class first.
activity_counts = data['Activity'].value_counts().sort_values()
activity_counts.plot.bar(color='pink')

In [10]:
# Remove the 'subject' column so it is not used as a feature.
data = data.drop(columns='subject')

In [11]:
# Replace the activity labels with integer codes; `encoder` keeps the fitted
# mapping so the codes can be translated back later.
encoder = LabelEncoder()
encoder.fit(data['Activity'])
data['Activity'] = encoder.transform(data['Activity'])

In [12]:
# A column is a duplicate when its values repeat those of an earlier column;
# transposing lets the row-wise `duplicated()` compare whole columns.
is_repeated_column = data.T.duplicated()
duplicate_columns = data.columns[is_repeated_column].tolist()

In [13]:
# Keep only the first occurrence of each set of identical columns.
data = data.drop(duplicate_columns, axis=1)

In [14]:
# (rows, columns) after removing 'subject' and the duplicated columns.
data.shape

In [15]:
# Target is the encoded activity; every other column is a predictor.
y = data['Activity']
X = data.drop(columns=['Activity'])

In [16]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [17]:
# Baseline: logistic regression with default settings on all remaining features.
# NOTE(review): warnings are globally suppressed in the imports cell, so any
# warning raised by this fit would not be shown — confirm the solver converges.
model= LogisticRegression()
model.fit(X_train,y_train)

In [18]:
# Held-out score of the baseline model; kept for the final comparison.
Model_Score_Raw_data=model.score(X_test,y_test)

In [19]:
# Training-set score, shown to compare against the held-out score.
model.score(X_train,y_train)

In [20]:
# Drop near-constant features: anything whose training-set variance does not
# exceed 0.05. The selector is fitted on the training split only and then
# applied unchanged to both splits.
selector = VarianceThreshold(threshold=0.05)
selected_features = selector.fit(X_train)  # fit() hands back the fitted selector
X_train_high_variance, X_test_high_variance = (
    selector.transform(split) for split in (X_train, X_test)
)



In [21]:
# Number of features that passed the variance threshold (low-variance ones
# have already been removed from the transformed arrays).
selected_features.get_support().sum()

In [22]:
# Boolean mask of the kept features, used to recover their column names
# (the transformed arrays no longer carry labels).
selected_indices=selected_features.get_support()
selected_feature_names= X_train.columns[selected_indices]

In [23]:
# Wrap the transformed arrays back into labelled frames. Note that both
# frames get a fresh 0..n-1 row index, not the original row labels.
X_train_vt, X_test_vt = (
    pd.DataFrame(values, columns=selected_feature_names)
    for values in (X_train_high_variance, X_test_high_variance)
)

In [24]:
# Shapes of both splits after variance filtering.
X_train_vt.shape, X_test_vt.shape

In [25]:
# Refit a fresh model on the variance-filtered features.
model=  LogisticRegression()
model.fit(X_train_vt,y_train)

In [26]:
# Held-out score after variance filtering; kept for the final comparison.
Model_Score_After_vt=model.score(X_test_vt,y_test)

In [27]:
# Training-set score of the variance-filtered model.
model.score(X_train_vt,y_train)

In [28]:
# Temporarily attach the target so feature/target correlations can be computed.
# X_train_vt was rebuilt from a bare array and therefore has a fresh 0..n-1
# row index, while `data` and `y_train` still carry the original row labels.
# Assigning a Series would align on those labels and attach targets from the
# wrong rows, so the training labels are assigned positionally instead.
X_train_vt['Activity'] = y_train.to_numpy()
X_train_vt.shape

In [29]:
# Display the training frame, now including the 'Activity' column.
X_train_vt

In [30]:
# Minimum feature/target correlation used by the Method-1 filter.
threshold = 0.0001
# Pairwise correlation of every column in the training frame, target included.
correlation_matrix = X_train_vt.corr()
# Correlation of each column with the target.
activity_corr = correlation_matrix.loc[:, 'Activity']


In [31]:
# Method-1: keep the features whose correlation with the target exceeds the
# threshold, then train and score a model on that subset.
#
# The magnitude of the correlation is compared, not the signed value: a
# strongly negative correlation is as informative as a positive one, and the
# signed comparison discarded every negatively correlated feature.
target_corr_strength = correlation_matrix['Activity'].abs()

selected_features = correlation_matrix[
    (target_corr_strength > threshold) &
    (correlation_matrix.index != 'Activity')  # the target is not a feature
]

print("Selected features:")
print(selected_features)

In [32]:
# Row labels of the filtered correlation matrix are the chosen feature names.
selected_feature_names = list(selected_features.index)

In [33]:
# Restrict both splits to the target-correlated features, in the listed order.
X_train_vt_ac = X_train_vt.reindex(columns=selected_feature_names)
X_test_vt_ac = X_test_vt.reindex(columns=selected_feature_names)

In [34]:
# (rows, columns) of the Method-1 training frame.
X_train_vt_ac.shape

In [35]:
# Refit a fresh model on the Method-1 feature subset.
model= LogisticRegression()
model.fit(X_train_vt_ac,y_train)

In [36]:
# Held-out score for Method 1 (feature-to-target correlation filter).
Model_Score_After_one_to_many_corr=model.score(X_test_vt_ac,y_test)

In [37]:
# Method-2: find features that are highly correlated with each other so the
# redundant ones can be removed.
#
# Changes from the naive pairwise scan:
#   * only ONE member of each correlated group is marked for removal; marking
#     both would discard the shared information instead of de-duplicating it;
#   * the helper 'Activity' column is excluded, because it is the target and
#     must never end up in the drop set;
#   * the magnitude of the correlation is used, since a strong negative
#     correlation is just as redundant as a positive one.

threshold= 0.95
correlation_matrix= X_train_vt.corr()

# Feature-to-feature correlation strengths only.
feature_corr = correlation_matrix.drop(
    index='Activity', columns='Activity', errors='ignore'
).abs()
corr_values = feature_corr.to_numpy()

highly_correlated_features= set()
kept_positions = []

# Greedy pass in column order: a feature is redundant when it is strongly
# correlated with a feature that has already been kept.
for position, feature_name in enumerate(feature_corr.columns):
    if any(corr_values[position, kept] > threshold for kept in kept_positions):
        highly_correlated_features.add(feature_name)
    else:
        kept_positions.append(position)




In [38]:
# Remove the redundant features from both splits.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run, and tolerates a label that exists in only one of the two
# frames (the test frame never received the helper 'Activity' column).
redundant_columns = list(highly_correlated_features)
X_train_vt = X_train_vt.drop(columns=redundant_columns, errors='ignore')
X_test_vt = X_test_vt.drop(columns=redundant_columns, errors='ignore')

In [39]:
# 'Activity' was added only to compute correlations; remove it before training.
# errors='ignore' keeps the cell re-runnable and avoids a KeyError when the
# column has already been removed.
X_train_vt = X_train_vt.drop(columns='Activity', errors='ignore')

In [40]:
# Align both splits on the test frame's column set and order.
method2_columns = X_test_vt.columns
X_train_ac2 = X_train_vt.reindex(columns=method2_columns)
X_test_ac2 = X_test_vt.reindex(columns=method2_columns)

In [41]:
# (rows, columns) of the Method-2 training frame.
X_train_ac2.shape

In [42]:
# Refit a fresh model on the Method-2 feature subset.
model= LogisticRegression()
model.fit(X_train_ac2,y_train)


In [43]:
# Held-out score for Method 2 (feature-to-feature correlation filter);
# the bare name on the last line displays it.
Model_Score_After_one_to_one_corr=model.score(X_test_ac2, y_test)
Model_Score_After_one_to_one_corr

In [44]:
# Side-by-side comparison of the held-out scores from each stage.
score_summary = (
    ('Model_Score_Raw_data:', Model_Score_Raw_data),
    ('Model_Score_After_vt:', Model_Score_After_vt),
    ('Model_Score_After_one_to_many_corr:', Model_Score_After_one_to_many_corr),
    ('Model_Score_After_one_to_one_corr:', Model_Score_After_one_to_one_corr),
)
for label, score in score_summary:
    print(label, score)