<a href="https://colab.research.google.com/github/Waranika/DCU-EE514-DA-ML/blob/main/Assignement/Assignement_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import gzip
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from cuml.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Directory that the following cells list and read the .gz files from.
# NOTE(review): presumably requires Google Drive to be mounted at
# /content/drive before this notebook runs — confirm.
path = "/content/drive/MyDrive/The ExtraSensory Dataset/ExtraSensory.per_uuid_features_labels"

In [None]:
# Collect the gzip archives found in `path`.
# os.listdir returns entries in arbitrary order, so sort them: the files are
# concatenated in this order later on, and a stable order keeps the resulting
# row order (and anything sampled from it) reproducible between runs.
files = sorted(f for f in os.listdir(path) if f.endswith('.gz'))
print(files)

In [None]:
# Read every gzip-compressed CSV listed in `files` and stack them into a single
# DataFrame. The frames are collected first and concatenated once: calling
# pd.concat inside the loop copies all previously loaded rows on every
# iteration, which gets slower with each additional file.
frames = []
for gz_file in files:
    with gzip.open(os.path.join(path, gz_file), 'rt') as f:
        # Assumes the CSV inside the .gz is comma-separated; adjust if needed.
        frames.append(pd.read_csv(f, delimiter=','))

# pd.concat raises on an empty list, so fall back to an empty DataFrame when
# no files were found.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [None]:
# Print the combined DataFrame for a quick visual check.
print(df)

*DATA REPRESENTATION*

In [None]:
# Summary statistics of df (last expression of the cell, so it is displayed).
df.describe()

In [None]:
# Keep only the columns whose name begins with 'proc' or 'raw'
# (axis=1 spells out that the pattern is matched against column names).
raw_data_readings = df.filter(regex='^(proc|raw)', axis=1)

# Summary statistics for that subset
raw_data_readings.describe()


In [None]:
# Extracting columns whose name contains 'watch'
raw_watch_readings = df.filter(like='watch')

# Describing the selected columns
raw_watch_readings.describe()

In [None]:
# Per-column count of missing entries in each of the two column subsets.
missing_values_raw_data = raw_data_readings.isna().sum()
missing_values_raw_watch = raw_watch_readings.isna().sum()

for missing_counts in (missing_values_raw_data, missing_values_raw_watch):
    print(missing_counts)

In [None]:
# Mean of the per-column missing fractions, one number per subset.
overall_average_missing_raw_data = raw_data_readings.isna().mean().mean()
overall_average_missing_raw_watch = raw_watch_readings.isna().mean().mean()

print("Overall Average Missing Values in raw_data_readings:", overall_average_missing_raw_data)
print("Overall Average Missing Values in raw_watch_readings:", overall_average_missing_raw_watch)

# One bar per subset; the subset name doubles as x position and legend label.
plt.figure(figsize=(10, 6))
bar_specs = (
    ("raw_data_readings", overall_average_missing_raw_data, "blue"),
    ("raw_watch_readings", overall_average_missing_raw_watch, "orange"),
)
for subset_name, missing_fraction, bar_color in bar_specs:
    plt.bar(subset_name, missing_fraction, color=bar_color, label=subset_name)

plt.title('Overall Average Missing Values')
plt.ylabel('Average Missing Values')
plt.legend()
plt.show()

In [None]:
# Columns whose name contains "watch"; print their column sums, then remove
# them from df.
watch_columns = df.filter(like="watch")
print(watch_columns.sum())
# Pass the column labels explicitly. Handing the DataFrame itself to
# `columns=` only works because iterating a DataFrame happens to yield its
# column names.
df = df.drop(columns=watch_columns.columns)
df.shape

In [None]:
# Subset of df restricted to columns whose name contains 'label:'
label_columns = df.filter(like='label:')

# Column-wise sum of the label columns
# (presumably 0/1 indicators, so the sum is the number of ones — confirm).
label_counts = label_columns.sum()

# Bar chart of the per-label sums
plt.figure(figsize=(12, 6))
label_counts.plot.bar(color='skyblue')
plt.title('Frequency of Each Label with Value 1')
plt.xlabel('Labels')
plt.ylabel('Frequency')
plt.xticks(rotation=45, ha='right')
plt.show()

*PREPROCESSING*

Delete incomplete columns


In [None]:
# Drop columns in which more than 60% of the rows are missing.
# A value counts as missing when it is NaN/None or the literal string 'nan'.

# Fraction of rows that may be missing before a column is discarded.
MISSING_FRACTION_THRESHOLD = 0.6
# Only the first 226 columns are examined, matching the cut-off of the
# original loop (it stopped once its counter exceeded 225).
MAX_COLUMNS_CHECKED = 226

# Derive the row threshold from the data instead of a hard-coded row count,
# so the cell stays correct when a different number of rows is loaded.
row_threshold = len(df) * MISSING_FRACTION_THRESHOLD

columns_to_drop = []
for column in df.columns[:MAX_COLUMNS_CHECKED]:
    missing = df[column].isnull().sum() + (df[column] == 'nan').sum()
    if missing > row_threshold:
        print(column)
        print(missing)
        columns_to_drop.append(column)

# Remove all flagged columns in one call rather than deleting them one at a
# time while looping over the column index.
df = df.drop(columns=columns_to_drop)

In [None]:
# (rows, columns) of df after the column removal above.
df.shape

Delete columns that are irrelevant to the analysis

In [None]:
# Remove the 'timestamp' column from df in place, then show the new shape.
df.drop(columns=['timestamp'], inplace=True)
df.shape

Delete known biased columns that might skew the prediction

In [None]:
# Correlation between the location columns and the label columns.

# Columns whose name starts with 'location' and with 'label' respectively.
location_columns = df.filter(regex='^location')
label_columns = df.filter(regex='^label')

# Put both groups side by side and compute their pairwise correlations.
selected_columns = pd.concat([location_columns, label_columns], axis=1)
correlation_matrix = selected_columns.corr()

print("Correlation Matrix:")
print(correlation_matrix)

# Heatmap of the same matrix. seaborn (sns) and matplotlib.pyplot (plt) are
# already imported in the first cell, so they are not re-imported here.
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Heatmap between Location and Label Columns')
plt.show()

In [None]:
# Remove the location columns selected in the previous cell.
# Pass the labels explicitly instead of the DataFrame itself; the latter only
# works because iterating a DataFrame yields its column names.
df = df.drop(columns=location_columns.columns)
df.shape

Check whether there is a correlation between each column and the labels to predict

In [None]:
# List the column labels that remain in df.
df.columns

In [None]:
# Standardize the sensor feature columns, then keep a small sample of rows.

# Number of leading rows kept afterwards; small on purpose to save time.
SAMPLE_ROWS = 500

# Columns whose name starts with "raw", "proc", or "audio"
selected_columns = [col for col in df.columns if col.startswith(("raw", "proc", "audio"))]
df_selected = df[selected_columns]

# NOTE(review): the scaler is fitted on every row before the train/test split
# in the model-building cell, so test rows influence the scaling statistics.
scaler = StandardScaler()
df_selected_standardized = pd.DataFrame(
    scaler.fit_transform(df_selected),
    columns=selected_columns,
    # Reuse df's index: the assignment below aligns on index labels, and a
    # fresh RangeIndex would silently misalign (or introduce NaN) if df's
    # index were ever anything other than 0..n-1.
    index=df.index,
)

# Replace the original columns with the standardized ones
df[selected_columns] = df_selected_standardized

# Take an explicit copy so later cells work on an independent frame rather
# than on a slice of the full data.
df = df.head(SAMPLE_ROWS).copy()

BUILD MODEL

In [None]:
# Split df into targets (columns starting with 'label:') and features
# (everything else), fill missing values, and create the train/test split.
y = df.filter(regex='^label:')
# Pass the labels explicitly; handing the DataFrame to `columns=` only works
# because iterating a DataFrame yields its column names.
x = df.drop(columns=y.columns)

# Replace missing values by 0. Rebinding (instead of inplace=True) avoids
# mutating frames that were derived from a slice of df.
# NOTE(review): for y this treats a missing label the same as a 0 label.
y = y.fillna(0)
x = x.fillna(0)

X_encoded = pd.get_dummies(x)
y_encoded = pd.get_dummies(y)

X_train, X_test, y_train, y_test = train_test_split(
    X_encoded.values,
    y_encoded.values,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Report how many values are missing in each df column that has any gaps.
has_missing = df.isna().any()
missing_columns = df.columns[has_missing]
missing_values = df.loc[:, missing_columns].isna().sum()

print("Columns with Missing Values:")
print(missing_values)

In [None]:
# Print the training feature array produced by train_test_split.
print(X_train)

In [None]:
# Base estimator: a random forest with 100 trees and a fixed seed.
rf_params = {"n_estimators": 100, "random_state": 42}
classifier = RandomForestClassifier(**rf_params)

# Wrap the base estimator in MultiOutputClassifier so that every label column
# in y_train is handled.
multi_output_classifier = MultiOutputClassifier(estimator=classifier, n_jobs=-1)

# Train on the training split
multi_output_classifier.fit(X_train, y_train)



CHECK RESULTS


In [None]:
# Predict every label for the held-out rows.
y_pred = multi_output_classifier.predict(X_test)

# With multilabel targets accuracy_score is the subset accuracy: a row only
# counts as correct when all of its labels are predicted correctly.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-label precision/recall/F1.
# target_names makes each row show the label's column name instead of a bare
# index. zero_division=0 reports 0 for labels without predicted or true
# samples instead of emitting an undefined-metric warning for each of them.
print("Classification Report:")
print(
    classification_report(
        y_test,
        y_pred,
        target_names=list(y_encoded.columns),
        zero_division=0,
    )
)