In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Load the sheet's CSV export and drop the 'Timestamp' column before any
# further processing. Chaining `.drop` (instead of `inplace = True`) keeps the
# cell a single expression that rebuilds `df` from scratch on every run.
df = (
    pd.read_csv('https://docs.google.com/spreadsheets/d/1ISEL4hYIGNcqZmQ2vZNry2nxP-eWBtC_WKzACYkJDQU/export?format=csv&gid=1163006501')
    .drop(columns = 'Timestamp')
)

# Preview the first five rows.
df.head()

In [None]:
# Label-encode every column of `df` into integer codes and split the encoded
# rows into a training set and a held-out test set.
#
# `transformer` maps each column name to the LabelEncoder fitted on that
# column, so encoded values can be mapped back with `inverse_transform`.

# Fixed seed so the train/test partition is identical on every
# Restart & Run All; without it every downstream score changes between runs.
SPLIT_SEED = 42

transformer = {col: preprocessing.LabelEncoder() for col in df.columns}
transformed_df = pd.DataFrame(
    {col: transformer[col].fit_transform(df[col]) for col in df.columns}
)

# Hold out 20% of the rows for evaluation.
(train_df, test_df) = train_test_split(transformed_df, test_size = 0.2, random_state = SPLIT_SEED)

# Sanity check: the split must neither lose nor duplicate rows.
assert len(train_df) + len(test_df) == len(transformed_df)

# Show a small preview rather than the whole training frame.
train_df.head()

In [None]:
%matplotlib inline 
sns.heatmap(transformed_df.corr(), cmap = 'rocket_r')

In [None]:
# Column headers of `df`, kept as a list so each one can be referenced by its
# position. The listing below prints every header (whitespace-trimmed) next to
# the index expression that selects it.
questions = df.columns.tolist()
print('\n'.join(
    f'questions[{position}]: {header.strip()}'
    for position, header in enumerate(questions)
))

In [None]:
# Choose, by position in `questions`, the columns used as model inputs and the
# single column the model should predict.
features = [questions[position] for position in (0, 1, 2, 3)]
label = questions[4]

# The target column must not also appear among the inputs.
if label in features:
    raise KeyError('feature containing label data. Try using another question.')

# Exactly four input columns are expected.
if len(features) != 4:
    raise ValueError('only 4 question allowed!!!')

In [None]:
# Fit a random forest on the training split, then evaluate it on the held-out
# split and build a human-readable table of predictions.

# Fixed seed so the fitted forest, and therefore the reported accuracy and
# confidence, are the same on every run.
MODEL_SEED = 42

x = train_df[features]
y = train_df[label]

# The variable is called `dtree`, but the estimator is a RandomForestClassifier;
# the name is kept unchanged in case other cells refer to it.
dtree = RandomForestClassifier(random_state = MODEL_SEED)
dtree = dtree.fit(x, y)

sample = test_df

# Run inference once and reuse the results below.
predictions = dtree.predict(sample[features])
probabilities = dtree.predict_proba(sample[features])

# Decode the integer codes back to the original values for display.
# Each column is replaced by plain assignment: writing the decoded values into
# the existing integer column via `.loc[:, col] = ...` is an in-place set with
# an incompatible dtype, which recent pandas versions warn about or reject.
x_sample = sample[features].copy()
for col in x_sample.columns:
    x_sample[col] = transformer[col].inverse_transform(x_sample[col])

x_sample[f'{label} (Prediction)'] = transformer[label].inverse_transform(predictions)
x_sample[f'{label} (Actual)'] = transformer[label].inverse_transform(sample[label])
# Confidence = highest predicted class probability per row, as a percentage.
x_sample['Confidence (%)'] = (probabilities.max(axis = 1) * 100).round(2)

print(f'Prediction accuracy: {dtree.score(sample[features], sample[label])*100:.2f}%')
print(f'Average confidence: {x_sample["Confidence (%)"].mean():.2f}%')

x_sample