In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
# Feature catalogue: the first whitespace-separated field of features.txt
# becomes the index (feature id) and the second field, labelled 1, holds the
# feature name.
# `sep=r'\s+'` is the documented replacement for the deprecated
# `delim_whitespace=True` and parses the file the same way.
colnames = pd.read_csv(
    'UCI HAR Dataset/features.txt',
    sep=r'\s+',
    header=None,
    index_col=0,
)
colnames

In [None]:
# Index labels (feature ids) of the rows in `colnames` whose content repeats
# an earlier row; the first occurrence of each name is not included.
duplicated_cols = colnames[colnames.duplicated()].index

In [None]:
# Training feature matrix. The file has no header row, so the columns receive
# default integer labels.
# `error_bad_lines=False` was removed in pandas 2.0 and raises a TypeError
# there; `on_bad_lines='warn'` (pandas >= 1.3) keeps the old behaviour of
# skipping malformed lines while still reporting them.
# `sep=r'\s+'` replaces the deprecated `delim_whitespace=True`.
X = pd.read_csv(
    'UCI HAR Dataset/train/X_train.txt',
    header=None,
    sep=r'\s+',
    on_bad_lines='warn',
)

In [None]:
# `duplicated_cols` holds feature ids taken from the index of `colnames`
# (the first field of features.txt), whereas `X` was read with header=None and
# so carries positional column labels starting at 0. Dropping by the raw label
# removes the wrong columns whenever the two numberings differ
# (NOTE(review): features.txt is expected to number features from 1 - confirm).
# Relabel the columns of X with the feature ids first, so the drop matches by
# id; this also fails loudly if the number of columns and features disagree.
X_unique = X.set_axis(colnames.index, axis='columns').drop(columns=duplicated_cols)

In [None]:
# Rename the remaining columns, by position, to the feature names that are
# left in `colnames` (its column labelled 1) once the duplicated ids are gone.
X_unique.columns = colnames.drop(index=duplicated_cols)[1]

In [None]:
# Number of rows that are exact repeats of an earlier row.
X_unique.duplicated().sum()

In [None]:
# Total number of missing cells across the whole feature table.
X_unique.isna().sum().sum()

In [None]:
# Numeric activity codes from y_train.txt and the names they are shown under.
activity_names = {
    1: 'WALKING',
    2: 'WALKING_UPSTAIRS',
    3: 'WALKING_DOWNSTAIRS',
    4: 'SITTING',
    5: 'STANDING',
    6: 'LAYING',
}

y = pd.read_csv('UCI HAR Dataset/train/y_train.txt', header=None, names=['label'])
y['label'] = y['label'].map(activity_names)

# Features and label side by side, aligned on the row index.
df = pd.concat([X_unique, y], axis='columns')

## Jak wygląda Y?

In [None]:
# Number of samples per activity label.
sns.histplot(data=df, x='label')
plt.xticks(rotation=90)
plt.show()

Najmniej jest próbek z chodzenia po schodach. Co ciekawe, próbek ze schodzenia po schodach jest mniej niż z wchodzenia po schodach. Dane zostały podzielone na okna czasowe o stałej długości, więc liczba próbek jest w przybliżeniu proporcjonalna do czasu trwania aktywności. Możemy stąd wywnioskować, że ludzie szybciej schodzą po schodach, niż wchodzą.

In [None]:
def plot_var(varname):
    """Draw and show a KDE plot of one column of the notebook-level `df`.

    The density is drawn with `sns.displot(kind='kde')` and coloured by the
    `label` column of `df`.

    Parameters
    ----------
    varname
        Name of the `df` column to put on the x axis.
    """
    sns.displot(data=df, x=varname, hue='label', kind='kde')
    plt.show()

In [None]:


# One density plot per feature column.
for feature_name in X_unique.columns:
    plot_var(feature_name)


In [None]:
# `df` also contains the string `label` column. Since pandas 2.0 `corr()` no
# longer drops non-numeric columns silently and raises instead, so select the
# numeric columns explicitly (this form also works on older pandas versions).
corr_matrix = df.select_dtypes(include='number').corr()

In [None]:
from itertools import combinations

def absHighPass(df, absThresh):
    """Keep only the labels involved in at least one strong off-diagonal entry.

    Parameters
    ----------
    df : pandas.DataFrame
        Square table whose index carries the same labels as its columns
        (for example the output of ``DataFrame.corr()``).
    absThresh : float
        A pair of distinct labels passes when the absolute value of its entry
        is greater than or equal to this threshold.

    Returns
    -------
    pandas.DataFrame
        ``df`` restricted, on both axes, to the sorted labels that belong to
        at least one passing pair. Empty when no pair passes.
    """
    labels = df.columns
    # Select rows by label in column order so that positions in the boolean
    # mask below line up with `labels` on both axes.
    square = df.loc[labels, labels]
    # `.ge` treats NaN as "not passing", like a scalar `>=` comparison.
    strong = square.abs().ge(absThresh).to_numpy()
    # Only entries strictly above the diagonal are examined (row label comes
    # before column label), so the diagonal itself never qualifies a label.
    strong = np.triu(strong, k=1)
    involved = strong.any(axis=0) | strong.any(axis=1)
    passed = sorted(labels[involved])
    return df.loc[passed, passed]

In [None]:
# Heatmap restricted to the features that take part in at least one pair
# with |correlation| >= 0.9.
strong_corr = absHighPass(corr_matrix, 0.9)

fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(strong_corr, ax=ax)
plt.show()

In [None]:
# `df` includes the string `label` column, on which `.var()` raises in
# pandas >= 2.0 (older versions skipped it silently). Compute the variance on
# the numeric columns only, then plot the 20 features with the largest variance.
numeric_variances = df.select_dtypes(include='number').var()
high_var_cols = numeric_variances.sort_values(ascending=False).head(20).index.tolist()
for varname in high_var_cols:
    plot_var(varname)

In [None]:
# Summary statistics of the columns of `df`.
df.describe()

In [None]:
# Density plots of the mean gravity acceleration on each axis.
gravity_mean_cols = (
    'tGravityAcc-mean()-X',
    'tGravityAcc-mean()-Y',
    'tGravityAcc-mean()-Z',
)
for gravity_col in gravity_mean_cols:
    plot_var(gravity_col)

In [None]:
# Hand-picked subset of features, plotted one by one.
significant_cols = [
    'tBodyAcc-max()-X',
    'tGravityAcc-mean()-X',
    'tGravityAcc-energy()-X',
    'tGravityAcc-correlation()-X,Y',
    'tBodyGyro-entropy()-Y',
    'tBodyGyroJerk-entropy()-Z',
    'tBodyAccMag-mad()',
    'fBodyAccJerk-mad()-Y',
    'fBodyAccJerk-maxInds-X',
    'fBodyAccJerk-maxInds-Z',
    'angle(X,gravityMean)',
]

for selected_col in significant_cols:
    plot_var(selected_col)

In [None]:
# Selected features only, and their pairwise correlations.
small_df = df[significant_cols]
small_corr = small_df.corr()

In [None]:
# Missing values per selected feature.
small_df.isna().sum()

In [None]:
# Annotated correlation heatmap of the selected features.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(small_corr, annot=True, ax=ax)
plt.show()