In [10]:
import pandas as pd

In [None]:
df = pd.read_csv('train.csv').drop(columns='subject')

In [None]:
#Output Column Categorical Value Classes
df['Activity'].value_counts()

Unnamed: 0_level_0,count
Activity,Unnamed: 1_level_1
LAYING,1407
STANDING,1374
SITTING,1286
WALKING,1226
WALKING_UPSTAIRS,1073
WALKING_DOWNSTAIRS,986


In [None]:
#Checking present number of columns (562)
df.shape

(7352, 562)

In [None]:
# Separate X and y
# Apply LabelEncoding on y
# Apply Logistic Regression
# Model Evaluation

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

X = df.drop(columns='Activity')
y = df['Activity']

#Label Encoding is applied only on target column
label = LabelEncoder()
label.fit_transform(y)

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)


In [None]:
# Applying Logistic Regression
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train,y_train)

y_pred = log_reg.predict(X_test)

accuracy = accuracy_score(y_test,y_pred)
print(accuracy)

0.9809653297076818


## As we can see, with 561 input features, LogisticRegression gives anaccuracy score of 98%. We will delete the irrelevant features and select only the relevant features, reapply logistic regression and checkaccuracy score again

# Feature Selection

## 1. Removing Duplicate Columns

In [None]:
def get_duplicate_columns(df):

    duplicate_columns = {}
    seen_columns = {}

    for column in df.columns:
        current_column = df[column]

        # Convert column data to bytes
        try:
            current_column_hash = current_column.values.tobytes()
        except AttributeError:
            current_column_hash = current_column.to_string().encode()

        if current_column_hash in seen_columns:
            if seen_columns[current_column_hash] in duplicate_columns:
                duplicate_columns[seen_columns[current_column_hash]].append(column)
            else:
                duplicate_columns[seen_columns[current_column_hash]] = [column]
        else:
            seen_columns[current_column_hash] = column

    return duplicate_columns



In [None]:
duplicate_columns = get_duplicate_columns(X_train)
duplicate_columns

{'tBodyAccMag-mean()': ['tBodyAccMag-sma()',
  'tGravityAccMag-mean()',
  'tGravityAccMag-sma()'],
 'tBodyAccMag-std()': ['tGravityAccMag-std()'],
 'tBodyAccMag-mad()': ['tGravityAccMag-mad()'],
 'tBodyAccMag-max()': ['tGravityAccMag-max()'],
 'tBodyAccMag-min()': ['tGravityAccMag-min()'],
 'tBodyAccMag-energy()': ['tGravityAccMag-energy()'],
 'tBodyAccMag-iqr()': ['tGravityAccMag-iqr()'],
 'tBodyAccMag-entropy()': ['tGravityAccMag-entropy()'],
 'tBodyAccMag-arCoeff()1': ['tGravityAccMag-arCoeff()1'],
 'tBodyAccMag-arCoeff()2': ['tGravityAccMag-arCoeff()2'],
 'tBodyAccMag-arCoeff()3': ['tGravityAccMag-arCoeff()3'],
 'tBodyAccMag-arCoeff()4': ['tGravityAccMag-arCoeff()4'],
 'tBodyAccJerkMag-mean()': ['tBodyAccJerkMag-sma()'],
 'tBodyGyroMag-mean()': ['tBodyGyroMag-sma()'],
 'tBodyGyroJerkMag-mean()': ['tBodyGyroJerkMag-sma()'],
 'fBodyAccMag-mean()': ['fBodyAccMag-sma()'],
 'fBodyBodyAccJerkMag-mean()': ['fBodyBodyAccJerkMag-sma()'],
 'fBodyBodyGyroMag-mean()': ['fBodyBodyGyroMag-sma()'

In [None]:
type(duplicate_columns)

dict

In [None]:
## duplicate_columns is a list with one column as key and duplicate columns as values
## Deleting the duplicate columns

for dups_list in duplicate_columns.values():
  X_train.drop(columns=dups_list,inplace=True)
  X_test.drop(columns=dups_list,inplace=True)

## 2. Variance Threshold

In [None]:
from sklearn.feature_selection import VarianceThreshold

vt = VarianceThreshold(threshold=0.05)
vt.fit(X_train)

In [None]:
columns = X_train.columns[vt.get_support()]

In [None]:
#X_train, X_test become arrays with transform
X_train = vt.transform(X_train)
X_test = vt.transform(X_test)

In [None]:
# Converting X_train, X_test back to dataframes

X_train = pd.DataFrame(X_train, columns=columns)
X_test = pd.DataFrame(X_test, columns=columns)

X_train.head()

Unnamed: 0,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,tBodyAcc-max()-Y,tBodyAcc-max()-Z,tBodyAcc-min()-X,...,fBodyBodyGyroJerkMag-meanFreq(),fBodyBodyGyroJerkMag-skewness(),fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)"
0,-0.994425,-0.994873,-0.994886,-0.994939,-0.993994,-0.99545,-0.938974,-0.577031,-0.813863,0.846922,...,0.394506,-0.684497,-0.901888,0.751089,0.626432,0.263401,-0.646225,0.398881,-0.546717,-0.4312
1,-0.326331,0.069663,-0.224321,-0.343326,0.039623,-0.256327,-0.310961,0.085617,-0.411806,0.271334,...,0.052089,-0.131946,-0.591721,0.628099,-0.357089,-0.925512,0.110212,-0.474013,0.383162,0.238969
2,-0.02622,-0.032163,0.393109,-0.118256,-0.030279,0.432861,0.370607,-0.072309,0.200747,0.118277,...,-0.038923,-0.64366,-0.935668,-0.182993,0.484857,-0.961034,0.147325,-0.819001,0.221549,-0.003571
3,-0.981092,-0.901124,-0.960423,-0.984417,-0.901405,-0.965788,-0.922291,-0.524676,-0.807362,0.82537,...,-0.145084,-0.32686,-0.645937,0.205896,-0.168239,-0.197557,0.328892,-0.744428,0.274256,0.031246
4,-0.99738,-0.983893,-0.984482,-0.997331,-0.985196,-0.983768,-0.942062,-0.564033,-0.810993,0.85333,...,0.096524,-0.113724,-0.445144,-0.267371,-0.304399,-0.081555,0.110146,-0.615329,0.322236,0.161577


## 3. Correlation

In [None]:
#Visualize heatmap
#import seaborn as sns
#sns.heatmap(X_train.corr(),annot=True)

In [None]:
corr_mat = X_train.corr()

In [None]:
cols = corr_mat.columns
cols_to_drop = []

for i in range(len(cols)):
  for j in range(i+1,len(cols)):
    if corr_mat.loc[cols[i],cols[j]] > 0.95 :
      cols_to_drop.append(cols[j])

cols_to_drop = set(cols_to_drop)

In [None]:
X_train.shape[1], len(cols_to_drop)

(349, 197)

In [None]:
X_train = X_train.drop(columns=cols_to_drop,axis=1)
X_test = X_test.drop(columns=cols_to_drop,axis=1)

X_train.shape, len(cols_to_drop)

((5881, 152), 197)

## ANOVA


In [None]:
from sklearn.feature_selection import f_classif, SelectKBest

k_best = SelectKBest(f_classif,k=100)
k_best.fit(X_train,y_train)

In [None]:
# Select the columns before transforming X_train, X_test
k_cols = X_train.columns[k_best.get_support()]

#transform converts df to array
X_train = k_best.transform(X_train)
X_test = k_best.transform(X_test)

#convert array back to df
X_train = pd.DataFrame(X_train,columns=k_cols)
X_test = pd.DataFrame(X_test,columns=k_cols)

X_train.shape[1]

100

# MOMENT OF TRUTH

In [None]:
# Reapplying logistic regression and evaluating model performance

log_reg1 = LogisticRegression(max_iter=1000)
log_reg1.fit(X_train,y_train)

y_pred1 = log_reg1.predict(X_test)

accuracy1 = accuracy_score(y_test,y_pred1)
accuracy1

0.9694085656016316

## 5. Chi -Square
### It is performed on categorical columns only
### Therefore we work with Titatnic dataset

## Method 1
## Chi2 from scipy.stats

In [11]:
df1 = pd.read_csv('/content/titanic_data.csv')[['Pclass','Sex','SibSp','Parch','Embarked','Survived']]

df1.head()

Unnamed: 0,Pclass,Sex,SibSp,Parch,Embarked,Survived
0,3,male,1,0,S,0
1,1,female,1,0,C,1
2,3,female,0,0,S,1
3,1,female,1,0,S,1
4,3,male,0,0,S,0


In [7]:
!pip install scipy.stats

[31mERROR: Could not find a version that satisfies the requirement scipy.stats (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for scipy.stats[0m[31m
[0m

In [42]:
from scipy import stats
from scipy.stats import chi2_contingency as chi2
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder


score = []
tit_cols = df1.drop(columns='Survived').columns

for col in df1.columns[:-1]:
  ct = pd.crosstab(df1[col],df1['Survived'])
  chi2_stat, p_value, dof, expected = chi2(ct)
  score.append([col,p_value])

  # Low p-value means high relationship

In [37]:
# Convert the list of tuples into a DataFrame for easy sorting
p_values_df = pd.DataFrame(score, columns=['Feature', 'p-value'])

# Sort the DataFrame by p-value in ascending order
p_values_df_sorted = p_values_df.sort_values(by='p-value', ascending=True)

# Select the top 3 features with the least p-values
top_3_features = p_values_df_sorted.head(3)

# Display the top 3 features with the least p-values
chi2_cols = top_3_features['Feature']

In [39]:
X = df1.drop(columns='Survived')
y = df1['Survived']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

# Create a LabelEncoder object
le = LabelEncoder()

# Apply label encoding to the 'Sex' and 'Embarked' columns
X_train['Sex'] = le.fit_transform(X_train['Sex'])
X_test['Sex'] = le.transform(X_test['Sex'])
X_train['Embarked'] = le.fit_transform(X_train['Embarked'].astype(str))  # Handle NaN
X_test['Embarked'] = le.transform(X_test['Embarked'].astype(str))       # Handle NaN


log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train,y_train)
y_pred = log_reg.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.770949720670391

# Method 2
# sklearn.feature_selection import chi2

```
# This is formatted as code
```



In [44]:
from sklearn.feature_selection import chi2

le = LabelEncoder()
titanic_encoded = df1.copy()
titanic_encoded['Sex'] = le.fit_transform(titanic_encoded['Sex'])
titanic_encoded['Embarked'] = le.fit_transform(titanic_encoded['Embarked'].astype(str))

X = titanic_encoded.drop('Survived', axis=1)
y = titanic_encoded['Survived']

# Calculate chi-squared stats
chi_scores = chi2(X, y)

# chi_scores[1] are the p-values of each feature.
p_values = pd.Series(chi_scores[1], index = X.columns)
p_values.sort_values(inplace = True)

# Create a LabelEncoder object
le = LabelEncoder()

# Apply label encoding to the 'Sex' and 'Embarked' columns
X_train['Sex'] = le.fit_transform(X_train['Sex'])
X_test['Sex'] = le.transform(X_test['Sex'])
X_train['Embarked'] = le.fit_transform(X_train['Embarked'].astype(str))  # Handle NaN
X_test['Embarked'] = le.transform(X_test['Embarked'].astype(str))       # Handle NaN


log_reg1 = LogisticRegression(max_iter=1000)
log_reg1.fit(X_train,y_train)
y_pred1 = log_reg1.predict(X_test)
accuracy = accuracy_score(y_test,y_pred1)
accuracy

0.770949720670391