In [None]:
import numpy as np #used for creating and manipulating arrays.
import pandas as pd #used for creating Series and DataFrames
import matplotlib.pyplot as plt #used for data visualization
import seaborn as sns #used for data visualization
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the primary dataset from a CSV in the working directory and preview the first rows.
df = pd.read_csv('out_prog.csv')
df.head()

In [None]:
# Load a second CSV into df_1; later cells label-encode it and feed it to PCA and clustering.
df_1 = pd.read_csv('out_prog_1.csv')

In [None]:
# (number of rows, number of columns) of df
df.shape

In [None]:
# Summary of df: column names, dtypes and non-null counts.
df.info()

# Numerical and Categorical Column Division

In [None]:
# Names of the numeric columns of df.
# Uses the public select_dtypes API instead of the private DataFrame._get_numeric_data();
# 'bool' is included because the private helper also treats boolean columns as numeric.
numeric_cols = df.select_dtypes(include=['number', 'bool']).columns
numeric_cols

In [None]:
# Every column of df that is not listed in numeric_cols, in the original column order.
is_numeric = df.columns.isin(numeric_cols)
categorical_cols = df.columns[~is_numeric]
categorical_cols

# Descriptive Analysis

In [None]:
# Share of respondents per Gender value, drawn as a pie chart with percentage labels.
gender_counts = df['Gender'].value_counts()
gender_counts.plot.pie(figsize=(10, 5), autopct='%1.1f%%')

In [None]:
# Horizontal bar chart: number of respondents per "enjoyed the most" answer.
most_col = 'Which part of the session you enjoyed the most?'
most_counts = df[most_col].value_counts()
most_counts.plot.barh(figsize=(10, 7), color='green')
plt.xlabel(most_col)

In [None]:
# Horizontal bar chart: number of respondents per "enjoyed the least" answer.
least_col = 'Which part of the session you enjoyed the least?'
least_counts = df[least_col].value_counts()
least_counts.plot.barh(figsize=(10, 5), color='red')
plt.xlabel(least_col)

In [None]:
# performing Label Encoding for categorical columns from 1 to 9
from sklearn.preprocessing import LabelEncoder

In [None]:
# First nine columns of df_1 (positional slice) - the columns that the next cell label-encodes.
df_cat = df_1.iloc[:, :9]
df_cat.head()

In [None]:
def labelling(x):
    """Label-encode column ``x`` of the notebook-level ``df_1`` in place.

    Parameters
    ----------
    x : column label present in ``df_1``.

    Returns
    -------
    The same ``df_1`` object, with column ``x`` replaced by the integer
    codes produced by a freshly fitted ``LabelEncoder``.
    """
    encoder = LabelEncoder()
    df_1[x] = encoder.fit_transform(df_1[x])
    return df_1


# Encode each of the first nine columns of df_1 (positional slice), then preview the result.
df_cat = df_1.iloc[:, 0:9]
for column_name in df_cat.columns:
    labelling(column_name)

df_1.head()

In [None]:
# NOTE(review): df_cat was sliced from df_1 before the encoding loop replaced the columns;
# whether this preview shows encoded or original values depends on pandas copy/view
# semantics - confirm which one is intended.
df_cat.head()

In [None]:
# Correlation heatmap of df's numeric columns.
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0 (older versions
# dropped them silently), so restrict the frame to numeric/bool dtypes explicitly.
plt.figure(figsize=(15,8))
numeric_df = df.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_df.corr())
plt.show()

In [None]:
# Histogram of the answers to the "more aware and confident" question.
# The x-axis was previously labelled 'Age' although a different column is plotted;
# the label now names the plotted column.
awareness_col = 'Are you more aware and confident about your career choices as well as decision making now?'
plt.figure(figsize=(15,8))
plt.hist(df[awareness_col],edgecolor='black')
plt.xlabel(awareness_col)
plt.ylabel('Number of Student')

In [None]:
# Histograms of df_1's columns. The size is set through rcParams, so it also becomes
# the default for figures created by later cells until rcParams is changed again.
plt.rcParams['figure.figsize'] = (12,14)
df_1.hist()

In [None]:
# Convert all columns of df_1 (11 columns, per the original note) into a NumPy array;
# x is the input matrix for the PCA cells.
x = df_1.values
x

In [None]:
# Column means (rounded to 2 decimals) over the first 11 columns of df.
# numeric_only=True skips text columns, which would otherwise raise in pandas >= 2.0.
df.iloc[:, 0:11].mean(numeric_only=True).round(2)

# Principal Component Analysis

In [None]:
from sklearn.decomposition import PCA
from sklearn import preprocessing

In [None]:
# Standardized copy of x: each column scaled to zero mean and unit variance.
pca_data = preprocessing.scale(x)

In [None]:
# Fit an 8-component PCA on the standardized matrix pca_data instead of raw x.
# pca_data was computed in the previous cell but never used; PCA on unscaled data is
# dominated by whichever columns happen to have the largest variance.
pca = PCA(n_components=8)
pc = pca.fit_transform(pca_data)
print(pc)

In [None]:
# Wrap the PCA scores in a DataFrame with one named column (PC1..PC8) per component.
colnames = ['PC{}'.format(k) for k in range(1, 9)]
df_pc = pd.DataFrame(pc, columns=colnames)
df_pc.head()

In [None]:
# Summary statistics of the principal-component scores.
df_pc.describe()

In [None]:
# Fraction of the total variance explained by each retained component.
pca_variance = pca.explained_variance_ratio_
pca_variance

In [None]:
# Cumulative explained-variance ratio across the components.
np.cumsum(pca_variance)

In [None]:
from bioinfokit.visuz import cluster

In [None]:
# get PC scores
# Reuse the scores of the already-fitted `pca` model rather than fitting a second,
# separate PCA: the biplot combines these scores with pca.components_, so both must
# come from the same fit.
pca_scores = pc

In [None]:
# get 2D biplot
# Variable labels come from df_1: the loadings in pca.components_ have one entry per
# column of df_1 (x = df_1.values), not per column of df.
cluster.biplot(
    cscore=pca_scores,
    loadings=pca.components_,
    labels=df_1.columns.values,
    var1=round(pca.explained_variance_ratio_[0]*100, 2),
    var2=round(pca.explained_variance_ratio_[1]*100, 2),
    show=True,
    dim=(10,5),
)

In [None]:
# K-means
from sklearn.cluster import KMeans

In [None]:
# Cluster the rows of df_1 into 4 groups.
# n_init is pinned to 10: the library default changed across scikit-learn releases
# (10 -> 'auto'), so leaving it implicit gives version-dependent clusters even with a
# fixed random_state.
kmeans = KMeans(n_clusters=4, init='k-means++', n_init=10, random_state=0).fit(df_1)
# NOTE(review): the cluster id is stored in df under the name of a survey question; if df
# already has a column with this name its answers are overwritten - confirm intended.
df['Do you have better clarity now on your next steps than before the program?'] = kmeans.labels_ # adding to df
print ('Labels to input data :', kmeans.labels_) # label assigned to each data point
print ('kMeans Inertia :',kmeans.inertia_) # within-cluster sum of squares
print('No. of iterations :', kmeans.n_iter_) # iterations run by the k-means algorithm
print('Centriods Location:\n', kmeans.cluster_centers_) # centroid coordinates of each cluster

In [None]:
# Number of rows per value of this column, which the K-means cell filled with cluster labels.
df['Do you have better clarity now on your next steps than before the program?'].value_counts()

In [None]:
from collections import Counter

In [None]:
# Cluster sizes: number of data points assigned to each K-means label.
Counter(kmeans.labels_)

In [None]:
# Visualizing clusters in the PC1/PC2 plane.
# KMeans was fitted on df_1, so kmeans.cluster_centers_ are coordinates in the original
# feature space; their first two entries are not PC1/PC2 values. Plot each cluster's
# centroid in PC space instead, i.e. the mean PC1/PC2 score of its members.
centroids_pc = df_pc[['PC1', 'PC2']].groupby(kmeans.labels_).mean()
sns.scatterplot(data=df_pc, x="PC1", y="PC2", hue=kmeans.labels_)
plt.scatter(centroids_pc['PC1'], centroids_pc['PC2'],
            marker="X", c="r", s=80, label="centroids")
plt.legend()
plt.show()

In [None]:
from scipy.cluster.hierarchy import linkage, dendrogram

In [None]:
# Create a dendrogram from Ward-linkage hierarchical clustering of df_1 to help judge
# a suitable number of clusters.
merg = linkage(df_1, method='ward')
# Figure size is set through rcParams, so it also applies to later figures.
plt.rcParams['figure.figsize'] = (7,5)
dendrogram(merg,leaf_rotation = 90)
plt.xlabel("data points", fontsize = 15)
plt.ylabel("euclidean distance", fontsize=15)
plt.show()

# Describing Segments

In [None]:
from statsmodels.graphics.mosaicplot import mosaic
from itertools import product

In [None]:
# Contingency table: cluster label (rows) vs. "enjoyed the least" answer (columns).
segment_col = 'Do you have better clarity now on your next steps than before the program?'
least_col = 'Which part of the session you enjoyed the least?'
crosstab = pd.crosstab(df[segment_col], df[least_col])
crosstab

In [None]:
# Mosaic plot of the crosstab counts. The figure size is set through rcParams,
# so it also applies to later figures until changed again.
plt.rcParams['figure.figsize'] = (15,10)
mosaic(crosstab.stack())
plt.show()

In [None]:
# Contingency table for the gender-vs-segment mosaic plot: cluster label (rows) vs. Gender (columns).
segment_col = 'Do you have better clarity now on your next steps than before the program?'
crosstab_gender = pd.crosstab(df[segment_col], df['Gender'])
crosstab_gender

In [None]:
# Mosaic plot of the gender-by-cluster counts. The rcParams figure size set here
# stays in effect for later cells until changed again.
plt.rcParams['figure.figsize'] = (7,5)
mosaic(crosstab_gender.stack())
plt.show()

In [None]:
# Distribution of Age within each cluster label, as one box per cluster.
segment_col = "Do you have better clarity now on your next steps than before the program?"
sns.boxplot(data=df, x=segment_col, y="Age")

In [None]:
# Preview df_1 (its first nine columns were label-encoded earlier in the notebook).
df_1.head()

# Selecting The Target

In [None]:
# Mean label-encoded "enjoyed the least" answer per cluster.
# The answer column of df is overwritten in place with its integer codes.
# NOTE(review): LabelEncoder codes follow the sorted order of the answers, so this mean
# is only meaningful if that order is - confirm.
segment_col = 'Do you have better clarity now on your next steps than before the program?'
least_col = 'Which part of the session you enjoyed the least?'
df[least_col] = LabelEncoder().fit_transform(df[least_col])
visit = df.groupby(segment_col)[least_col].mean().reset_index()
visit

In [None]:
# Like: mean label-encoded "enjoyed the most" answer per cluster.
# The answer column of df is overwritten in place with its integer codes.
segment_col = 'Do you have better clarity now on your next steps than before the program?'
most_col = 'Which part of the session you enjoyed the most?'
df[most_col] = LabelEncoder().fit_transform(df[most_col])
Like = df.groupby(segment_col)[most_col].mean().reset_index()
Like

In [None]:
# Gender: mean label-encoded Gender per cluster.
# df['Gender'] is overwritten in place with its integer codes.
segment_col = 'Do you have better clarity now on your next steps than before the program?'
df['Gender'] = LabelEncoder().fit_transform(df['Gender'])
Gender = df.groupby(segment_col)['Gender'].mean().reset_index()
Gender

In [None]:
# One row per cluster: the per-cluster means from Gender, Like and visit side by side.
segment_col = 'Do you have better clarity now on your next steps than before the program?'
segment = (
    Gender
    .merge(Like, on=segment_col, how='left')
    .merge(visit, on=segment_col, how='left')
)
segment

In [None]:
#Target segments
# x holds the per-cluster mean "enjoyed the least" code and y the mean "enjoyed the most"
# code, so x is the "Need Improvement" axis and y the "Like" axis. The two axis labels
# were previously swapped.
plt.rcParams['figure.figsize'] = (7,5)
sns.scatterplot(x = "Which part of the session you enjoyed the least?", y = "Which part of the session you enjoyed the most?",data=segment,s=400, color="r")
plt.title("Simple segment evaluation plot for the feedback evaluation \n",fontsize = 15) 
plt.xlabel("Need Improvement", fontsize = 15) 
plt.ylabel("Like", fontsize = 15) 
plt.show()