![Header](https://storage.googleapis.com/kaggle-competitions/kaggle/31480/logos/header.png)

In [None]:
from IPython.core.display import display, HTML, Javascript

def nb():
    """Return an ``HTML`` object wrapping the custom pink stylesheet in a ``<style>`` tag.

    The CSS text is read from the hard-coded Kaggle input path below.
    """
    css_path = "../input/intermediate-notebooks-data/custom_pink.css"
    # Context manager closes the file handle once the CSS has been read.
    with open(css_path, "r") as css_file:
        styles = css_file.read()
    return HTML("<style>"+styles+"</style>")
# nb()

# Introduction 📝

- Yeah, folks! I'm entering an EDA competition for the first time, hoping to find and learn something new.
- I have a couple of public notebooks, but this is the first one with at least some explanation and details about what I am doing.
- Analyzing survey data is important for studying the market, demographics and current trends, and for making data-driven decisions based on the insights generated from this data.
- You may find below some simple and easy to understand visualizations created using seaborn, plotly and folium.

# Import libraries 📚

In [None]:
!pip install country_converter

import os
import re
import time
import datetime
import warnings
from tqdm import tqdm

import requests

# Data Manipulation Libraries
import geopandas as gpd
import pandas as pd
import numpy as np

# Data Vizualization libraries
import seaborn as sns

import matplotlib.pyplot as plt

import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

import branca
import folium
from folium import GeoJson, Choropleth, GeoJsonTooltip
from folium.plugins import MarkerCluster

#Geocoding Libraries
import country_converter as coco
from geopy.geocoders import Nominatim

# Text Processing Libraries
import nltk
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize 

import spacy
from spacy import displacy

from sklearn.feature_extraction.text import CountVectorizer as CV
from sklearn.feature_extraction.text import TfidfVectorizer

from collections import Counter
from wordcloud import WordCloud,STOPWORDS

warnings.filterwarnings("ignore")
nltk.download('stopwords')

In [None]:
def custom_palette(custom_colors):
    """Make ``custom_colors`` the active seaborn palette and draw a swatch preview.

    Parameters
    ----------
    custom_colors : list
        Colours accepted by ``sns.color_palette`` (hex strings in this notebook).
    """
    palette = sns.color_palette(custom_colors)
    # set_palette acts purely through its side effect (it returns None),
    # so its result is not stored.
    sns.set_palette(palette)
    sns.palplot(palette, size=0.8)
    # Hide tick marks and tick labels on the swatch strip.
    plt.tick_params(axis='both', labelsize=0, length = 0)

In [None]:
# Colour palettes (hex codes) available to the plots in this notebook
red = [
    "#4f000b", "#720026", "#ce4257", "#ff7f51", "#ff9b54",
]
bo = [
    "#6930c3", "#5e60ce", "#0096c7", "#48cae4",
    "#ade8f4", "#ff7f51", "#ff9b54", "#ffbf69",
]
pink = [
    "#aa4465", "#dd2d4a", "#f26a8d", "#f49cbb", "#ffcbf2", "#e2afff",
    "#ff86c8", "#ffa3a5", "#ffbf81", "#e9b827", "#f9e576",
]

# Preview each palette in turn; custom_palette() also sets the seaborn palette,
# so `red` (the last one) is the palette left active afterwards.
for palette_colors in (pink, bo, red):
    custom_palette(palette_colors)

In [None]:
# Plot scaling: seaborn's "poster" context with fonts scaled to 0.6 and thin grid lines
sns.set_context("poster", font_scale = 0.6, rc={"grid.linewidth": 0.4})

# Use a serif font family for seaborn/matplotlib figures
sns.set_style({'font.family':'serif'})

In [None]:
# Load the 2021 Kaggle survey responses.
# Later cells read row 0 as the question text and rows 1+ as the answers.
survey_df =  pd.read_csv("../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv")

# General Stats

In [None]:
# Two side-by-side "big number" indicators: response count and question count.

# Row 0 of survey_df holds the question text, not a response, so it is
# excluded from the response count.
total_responses = survey_df.shape[0] - 1

fig = go.Figure()

fig.add_trace(
    go.Indicator(
        mode = "number",
        value = total_responses,
        title = {
            "text": """Total Number of Responses"""
        },
        domain = {'x': [0, 0.5], 'y': [0, 1]},
    )
)

fig.add_trace(
    go.Indicator(
        mode = "number",
        # Hard-coded by the author; not derived from the data in this cell.
        value = 42,
        title = {
            "text": """Number of Questions Asked"""
        },
        domain = {'x': [0.6, 1], 'y': [0, 1]}
    )
)

fig.update_layout(paper_bgcolor = "lightgray", height=300)

fig.show()

In [None]:
# Time each respondent spent on the survey, converted from seconds to minutes.
# Built as a fresh Series (no in-place rename on a slice of survey_df);
# row 0 holds the question text and is skipped before the numeric conversion.
hist_data = (
    survey_df["Time from Start to Finish (seconds)"]
    .iloc[1:]
    .astype(int)
    .div(60)
    .rename("Time from Start to Finish (minutes)")
)

fig = px.box(
    hist_data,
    orientation='h',
    notched=True,
    title="Distribution of time spent by individual to fill in the survey"
)
# Clip the visible range to the first hour; longer durations fall outside the view.
fig.update_xaxes(range=[0, 60])
fig.update_layout(
    title = dict(
        font_size = 25,
    ),
    title_x=0.5,
    xaxis_title = 'Minutes',
)
fig.show()

In [None]:
# Survey duration (in minutes) paired with the age group answer (Q1); row 0 is skipped.
minutes_col = "Time from Start to Finish (minutes)"
hist_time_age_data = survey_df.loc[1:, ["Time from Start to Finish (seconds)", "Q1"]]
hist_time_age_data.rename(
    columns={"Time from Start to Finish (seconds)": "Time from Start to Finish (minutes)"},
    inplace=True,
)
hist_time_age_data[minutes_col] = hist_time_age_data[minutes_col].apply(int) / 60

# One notched box per age group, coloured by age group.
fig = px.box(
    hist_time_age_data,
    x='Q1',
    y="Time from Start to Finish (minutes)",
    color='Q1',
    notched=True,
    title="Distribution of time spent by individual by age to fill in the survey",
)
# Show only the first 60 minutes on the y axis.
fig.update_yaxes(range=[0, 60])
fig.update_layout(
    title=dict(font_size=25),
    title_x=0.5,
    xaxis_title='Age Groups',
    yaxis_title='Time spent in minutes',
)
fig.show()

The median time spent by an individual participating in the survey is about **11 minutes**, while the mean time spent is about **35 minutes**.

# Table: Questions Asked

In [None]:
# Table of every survey column (underscores shown as spaces) next to its question
# text, which is stored in row 0 of survey_df. Column 0 is left out.
table_header = dict(
    values=["Question Number / Sections / Parts", "Description"],
    fill_color=bo[2],
    line_color='white',
    align='center',
)
question_labels = []
for column_name in survey_df.columns[1:]:
    question_labels.append(column_name.replace('_', " "))
table_cells = dict(
    values=[question_labels, survey_df.iloc[0, 1:]],
    fill_color=bo[4],
    line_color='white',
    align='left',
)

fig = go.Figure(data=[go.Table(header=table_header, cells=table_cells)])
fig.update_layout(
    title=dict(text='Questions Asked in Survey 2021', font_size=25),
    title_x=0.5,
    # Transparent backgrounds
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
)
fig.show()

# Age Distribution

In [None]:
# Count responses per age group (Q1), largest first, and express them as percentages.
age_dist_df = (
    survey_df.iloc[1:, 1:]
    .groupby("Q1")
    .agg('count')[["Q2"]]
    .sort_values("Q2", ascending=False)
    .reset_index()
)
age_dist_df.columns = ["age_grp", "Number of Responses"]
age_share = age_dist_df["Number of Responses"] / sum(age_dist_df["Number of Responses"])
age_dist_df['Percentage of Responses'] = round(age_share * 100, 2)

fig = px.bar(
    age_dist_df,
    x="age_grp",
    y="Percentage of Responses",
    color='Percentage of Responses',
)
fig.update_layout(
    title=dict(text='Generic Age Distribution', font_size=25),
    title_x=0.5,
    xaxis_title='Age Group',
    yaxis_title='Percentage of Responses',
)
fig.show()

In [None]:
# Responses per (age group, gender) pair.
# The smaller gender categories are folded into one label BEFORE grouping, so each
# (age group, gender) pair yields exactly one row. Relabelling after the groupby
# leaves several rows with the same pair, which plot as overlapping bars.
age_gender_responses = survey_df.iloc[1:, 1:][["Q1", "Q2", "Q3"]].copy()
age_gender_responses["Q2"] = age_gender_responses["Q2"].replace(
    {
        "Nonbinary": "Prefer not to say",
        "Prefer to self-describe": "Prefer not to say",
    }
)

# Count the non-null Q3 answers in each group.
age_dist_df = age_gender_responses.groupby(["Q1", "Q2"])["Q3"].count().reset_index()
age_dist_df.columns = ["age_grp","Gender","num_resp"]

fig = px.bar(
    age_dist_df, 
    x="age_grp", 
    y="num_resp",
    color='Gender',
    barmode='group', 
    # Log scale keeps the small categories visible next to the large ones.
    log_y=True
)
fig.update_layout(
    title = dict(
        text = 'Age Distribution by Gender',
        font_size = 25,
    ),
    title_x=0.5,
    xaxis_title = 'Age Group',
    yaxis_title = 'Number of Responses',
)
fig.show()

# Participants from different countries

In [None]:
# Converting country names to ISO 3166 alpha-3 codes
cc = coco.CountryConverter()
input_countries = survey_df.Q3[1:].unique()
country_codes = cc.convert(names = input_countries, to = 'ISO3',not_found=np.nan)
# Survey country name -> ISO3 code (NaN where no match was found)
country_codes = dict(zip(input_countries,country_codes))
# Reverse lookup: ISO3 code -> survey country name. Built from the (name, code)
# pairs; zipping the dict with the names would only map each name to itself.
country_names = {
    code: name for name, code in country_codes.items() if isinstance(code, str)
}

# Geojson with country boundaries
country_geodf = gpd.read_file("https://datahub.io/core/geo-countries/r/countries.geojson")

# Merging GeoJSON with AGGREGATED dataframe
# Work on an explicit copy so the new column is not written onto a slice of survey_df.
country_df = survey_df.iloc[1:,1:][['Q3']].copy()
country_df['ISO_A3'] = country_df['Q3'].map(country_codes)
# groupby drops rows whose code is NaN, so the percentages below are relative
# to the responses that could be mapped to a country code.
country_df = country_df.groupby('ISO_A3').count()[['Q3']].reset_index()
country_df["Q3"] = round(country_df['Q3']/sum(country_df['Q3'])*100,2)
country_df = country_df.sort_values('Q3',ascending=False)
country_df = country_geodf.merge(country_df,on='ISO_A3')

In [None]:
# World choropleth of the percentage of responses per country, with a hover tooltip.
country_map = folium.Map(location=[40, 0], zoom_start=1.5)

Choropleth(
    geo_data=country_df,
    data=country_df,
    columns=['ADMIN',"Q3"],
    key_on="feature.properties.ADMIN",
    fill_color='YlOrRd',
    fill_opacity=1,
    line_opacity=0.2,
    legend_name="Percentage of Responses",
    smooth_factor=0,
    # The keyword is lowercase `highlight`; the capitalised spelling is not the
    # documented option and does not enable hover highlighting.
    highlight=True,
    bins=3,
    line_color = "#0000",
    name = "Percentage of Responses",
    show=True,
    overlay=True,
    nan_fill_color = "White"
).add_to(country_map)


# Add hover functionality.
def style_function(feature):
    """Default style of the hover overlay: almost transparent white fill."""
    return {
        'fillColor': '#ffffff', 
        'color':'#000000', 
        'fillOpacity': 0.1, 
        'weight': 0.1
    }


def highlight_function(feature):
    """Style applied to the feature under the cursor: semi-opaque black fill."""
    return {
        'fillColor': '#000000', 
        'color':'#000000', 
        'fillOpacity': 0.50, 
        'weight': 0.1
    }


# Overlay carrying the tooltip; kept in front of the choropleth layer.
hover_layer = GeoJson(
    data = country_df,
    style_function=style_function, 
    control=False,
    highlight_function=highlight_function, 
    tooltip=GeoJsonTooltip(
        fields=['ADMIN',"Q3"],
        aliases=['Country Name',"% of Responses"],
        style=("background-color: white; color: #333333; font-family: arial; font-size: 12px; padding: 10px;") 
    )
)
country_map.add_child(hover_layer)
country_map.keep_in_front(hover_layer)

# Selectable base maps plus the layer switcher.
folium.TileLayer('cartodbdark_matter',name="Dark mode",control=True).add_to(country_map)
folium.TileLayer('cartodbpositron',name="Light mode",control=True).add_to(country_map)
folium.LayerControl(collapsed=True).add_to(country_map)

country_map

Nearly 31% participants are from India, 11% participants are from USA.

Kaggle: The platform is used by students, professionals and experts from more than 64 countries.

# Job Titles and Roles 

In [None]:
# Responses per job title (Q5), drawn as a donut chart.
profession_counts = survey_df.iloc[1:, 1:]["Q5"].value_counts()
job_title_df = pd.DataFrame(profession_counts).reset_index()
job_title_df.columns = ["Profession", "Counts"]

fig = px.pie(
    job_title_df,
    names="Profession",
    values="Counts",
    hole=.4,
    title="Distrbution of Responses vs Profession",
)
# Labels go outside the slices.
fig.update_traces(textposition='outside')
fig.update_layout(title=dict(font_size=25), title_x=0.5)
fig.show()

# Highest Level of Education

In [None]:
# Responses per highest education level (Q4), drawn as a donut chart.
education_counts = survey_df.iloc[1:, 1:]["Q4"].value_counts()
education_df = pd.DataFrame(education_counts).reset_index()
education_df.columns = ["Education", "Counts"]

fig = px.pie(
    education_df,
    names="Education",
    values="Counts",
    hole=.5,
    title="Distrbution of Responses vs Highest level education",
)
# Labels go outside the slices.
fig.update_traces(textposition='outside')
fig.update_layout(title=dict(font_size=25), title_x=0.5)
fig.show()

Kaggle is mostly used by students, and it has become a highly popular platform for learning and exploring the fields of Machine Learning, Data Science and AI.

These industries are the ones that are growing, booming, expanding and finding applications in almost every domain possible.

# Profession & Coding Experience

In [None]:
# Number of respondents for every (profession Q5, coding experience Q6) pair.
pair_sizes = survey_df.iloc[1:, 1:].groupby(["Q5", "Q6"]).size()
work_exp_df = pd.DataFrame(pair_sizes).reset_index()
work_exp_df.columns = ["profession", "Coding Experience", "counts"]
work_exp_df = work_exp_df.sort_values("counts", ascending=False)

# Grouped bars per profession, coloured by coding experience, on a log scale.
fig = px.bar(
    work_exp_df,
    x="profession",
    y="counts",
    color='Coding Experience',
    barmode='group',
    log_y=True,
)
fig.update_layout(
    title=dict(text='Profession and Coding Experience', font_size=25),
    title_x=0.5,
    xaxis_title='Profession',
    yaxis_title='Number of Responses',
)
fig.show()

# Popular Programming Languages

In [None]:
def get_counts_for_question(
    question_num: int,
    division: str = None,
    column_names: list = None,
    data: pd.DataFrame = None,
) -> pd.DataFrame:
    """Count the selections of each option of a multi-column survey question.

    For every column belonging to the question, the most frequent answer
    (ignoring row 0, which holds the question text) and its count are taken.

    Parameters
    ----------
    question_num : int
        Question number, e.g. ``7`` for the ``Q7_*`` columns.
    division : str, optional
        Sub-section of the question (the ``A`` in ``Q27_A_Part_1``).
    column_names : list, optional
        Names of the two output columns (label, count).
        Defaults to ``["Values", "Counts"]``.
    data : pd.DataFrame, optional
        Survey frame to read from; defaults to the notebook-level ``survey_df``.

    Returns
    -------
    pd.DataFrame
        One row per option that was selected at least once, sorted by count
        in descending order.
    """
    if data is None:
        data = survey_df
    if column_names is None:
        column_names = ["Values", "Counts"]

    # Match the question prefix exactly: it must be followed by "_" or the end
    # of the name, so that Q1 does not also pick up Q10, Q11, Q17, ...
    prefix = f"Q{question_num}" if division is None else f"Q{question_num}_{division}"
    pattern = re.compile(rf"^{re.escape(prefix)}(?:_|$)")
    cols = [col for col in data.columns if pattern.match(col)]

    rows = []
    for col in cols:
        counts = data[col].iloc[1:].value_counts()
        if counts.empty:
            # Nobody selected this option; there is no label or count to report.
            continue
        # .iloc gives the first (largest) count regardless of the index labels.
        rows.append([counts.index[0], counts.iloc[0]])

    return pd.DataFrame(rows, columns=column_names).sort_values(
        column_names[1], ascending=False
    )

In [None]:
# Selections per programming language (Q7), drawn as a labelled donut chart.
lang_columns = ["Programming Language", "Number of Responses"]
programming_lang_df = get_counts_for_question(7, column_names=lang_columns)

fig = px.pie(
    programming_lang_df,
    names="Programming Language",
    values="Number of Responses",
    hole=.7,
    title="Programming Languages used widely",
)
# Show only the language name, placed outside each slice; the legend is hidden.
fig.update_traces(textinfo='label', textposition='outside')
fig.update_layout(title=dict(font_size=25), title_x=0.5, showlegend=False)
fig.show()

# Recommended Programming Language

In [None]:
# Recommendations per programming language (Q8), most recommended first.
q8_counts = survey_df.iloc[1:, 1:]["Q8"].value_counts()
recommeded_pl_df = pd.DataFrame(q8_counts).reset_index()
recommeded_pl_df.columns = ["Programming Languages", "Recommendations"]
recommeded_pl_df = recommeded_pl_df.sort_values("Recommendations", ascending=False)

# The chart title is the Q8 question text (row 0) up to the first "-".
q8_title = survey_df.iloc[0]["Q8"].split("-")[0].strip()

fig = px.bar(
    recommeded_pl_df,
    x="Recommendations",
    y="Programming Languages",
    color='Programming Languages',
    orientation='h',
)
fig.update_layout(
    title=dict(text=q8_title, font_size=17),
    title_x=0.5,
    xaxis_title='Recommendations',
    yaxis_title='Programming Languages',
)
fig.show()

#### Well it is now evident why Python is the most popular language and top most recommendation
- Easy to learn, grasp new concepts
- Can apply to almost every possible applications
- A ton of open source libraries available
- Large community
- Ample tutorials are available on platforms like YouTube, Coursera, HackerRank, etc., for diverse applications.

# Programming Languages used regularly by participants, by profession

In [None]:
def _plot_annotated_heatmap(values_df, title, x_title, y_title, colorscale):
    """Show one annotated heatmap of ``values_df`` (index on y, columns on x)."""
    fig = ff.create_annotated_heatmap(
        values_df.values.tolist(),
        x=values_df.columns.tolist(),
        y=values_df.index.tolist(),
        colorscale=colorscale
    )
    fig.update_layout(
        title = dict(
            text = title,
            font_size = 20,
        ),
        title_x=0.5,
        xaxis_title = x_title,
        yaxis_title = y_title,
        xaxis={'side': 'bottom'},
    )
    fig.show()


def get_relation_and_numresp_between_question(
    single_col_qnum: int, 
    multiple_col_qnum: int,
    multiple_col_q_division: str = None,
    index_name: str = "Index Name", 
    x_title: str = "X Axis Title", 
    y_title: str = "Y Axis Title", 
    title1: str = "Title 1", 
    title2: str = "Title 2",
    colorscale: str = "Viridis"
) -> None:
    """Plot two heatmaps relating a single-answer question to a multi-answer one.

    Heatmap 1 shows, for every answer of the single-column question, the
    percentage of respondents who selected each option of the multi-column
    question. Heatmap 2 shows the corresponding raw counts.

    Parameters
    ----------
    single_col_qnum : int
        Number of the single-answer question (one column, ``Q<n>``).
    multiple_col_qnum : int
        Number of the multiple-answer question (columns ``Q<n>_...``).
    multiple_col_q_division : str, optional
        Sub-section of the multiple-answer question (the ``A`` in ``Q27_A_Part_1``).
    index_name : str
        Name given to the index of the aggregated frames.
    x_title, y_title : str
        Axis titles used on both heatmaps.
    title1, title2 : str
        Titles of the percentage heatmap and the count heatmap.
    colorscale : str
        Plotly colorscale name.

    Reads the notebook-level ``survey_df``; row 0 (question text) is skipped.
    """
    # Columns of the multiple-answer question. The prefix must be followed by
    # "_" or the end of the name, so that e.g. Q1 does not also match Q10.
    if multiple_col_q_division is not None:
        prefix = f"Q{multiple_col_qnum}_{multiple_col_q_division}"
    else:
        prefix = f"Q{multiple_col_qnum}"
    pattern = re.compile(rf"^{re.escape(prefix)}(?:_|$)")
    cols = [col for col in survey_df.columns if pattern.match(col)]

    group_col = f"Q{single_col_qnum}"
    answers = survey_df.loc[1:, cols]
    group_key = survey_df.loc[1:, group_col]

    # Each option column holds the option label where it was selected and NaN
    # otherwise; use that label as the display name, falling back to the column
    # name when nobody selected the option.
    option_labels = []
    for col in cols:
        counts = answers[col].value_counts()
        option_labels.append(counts.index[0] if not counts.empty else col)

    # Relations: share (in %) of each group that selected each option.
    # A cell counts as "selected" when it is not null.
    rel_df = answers.notna().astype(int).join(group_key)
    rel_df = round(rel_df.groupby(group_col).agg('mean')*100,1)
    rel_df.index.name = index_name
    rel_df.columns = option_labels

    # Number of responses: non-null selections per group and option.
    relc_df = answers.join(group_key).groupby(group_col).agg('count')
    relc_df.index.name = index_name
    relc_df.columns = rel_df.columns

    # Creating and plotting heatmaps
    _plot_annotated_heatmap(rel_df, title1, x_title, y_title, colorscale)
    _plot_annotated_heatmap(relc_df, title2, x_title, y_title, colorscale)

In [None]:
# Heatmaps of programming languages used (Q7) broken down by profession (Q5):
# first the percentage view, then the raw number of responses.
get_relation_and_numresp_between_question(
    single_col_qnum = 5, 
    multiple_col_qnum = 7,
    multiple_col_q_division = None,
    index_name = "Profession", 
    x_title = "Programming Languages Used", 
    y_title = "Profession", 
    title1 = "Relation between Programming Languages and Profession", 
    title2 = "Number of Responses for a programming language by profession",
    colorscale="YlGnBu"
)

#### Python is super popular for Data Science and ML:
- **Students** - They start their ML/DS journey and their exploration of this vast domain by learning languages such as Python and R. We also see a proportionate number of responses for SQL and C/C++, as they are part of their coursework. MATLAB is used especially by those who aim to be involved in full-time research and are highly interested in mathematics and related domains.
- **Statistician** - Python and R receive an almost equal number of votes, followed by SQL and C, for the simple reason that these folks are more into data analysis and statistical research.


# Integrated Development Environments(IDEs)

In [None]:
# Share of selections per IDE (Q9), drawn as a polar bar chart.
idf = get_counts_for_question(9, column_names=['IDEs', "Number of Responses"])
ide_total = sum(idf["Number of Responses"])
idf["% of Responses"] = idf["Number of Responses"] / ide_total * 100

fig = px.bar_polar(
    idf,
    theta="IDEs",
    r="% of Responses",
    color="IDEs",
    color_discrete_sequence=px.colors.sequential.Plasma_r,
)
fig.update_layout(
    title=dict(text="IDEs used on regular Basis", font_size=25),
    title_x=0.5,
    showlegend=False,
)
fig.show()

# Hosted Notebook Products 📓

In [None]:
# Selections per hosted notebook product (Q10), sorted in ascending order.
idf = get_counts_for_question(
    10, column_names=['Hosted Notebook Products',"Number of Responses"]
).sort_values("Number of Responses")
# Bubble chart: marker size and colour both encode the number of responses.
fig = px.scatter(
    idf, 
    y="Hosted Notebook Products", 
    x="Number of Responses", 
    size="Number of Responses",
    color="Number of Responses",
    color_continuous_scale='picnic',
)
# Title added to match the other figures in this notebook.
fig.update_layout(
    title = dict(
        text = "Hosted Notebook Products used on a regular basis",
        font_size = 25,
    ),
    title_x=0.5,
)
fig.show()

# Computing Platforms

In [None]:
# Responses per computing platform (Q11), most common first.
q11_counts = survey_df.iloc[1:, 1:]["Q11"].value_counts()
recommeded_pl_df = pd.DataFrame(q11_counts).reset_index()
recommeded_pl_df.columns = ["Computing Platform", "Number of Responses"]
recommeded_pl_df = recommeded_pl_df.sort_values("Number of Responses", ascending=False)

# The chart title is the Q11 question text (row 0) up to the first "-".
q11_title = survey_df.iloc[0]["Q11"].split("-")[0].strip()

fig = px.bar(
    recommeded_pl_df,
    x="Number of Responses",
    y="Computing Platform",
    color='Computing Platform',
    orientation='h',
)
fig.update_layout(
    title=dict(text=q11_title, font_size=17),
    title_x=0.5,
    yaxis_title='Computing Platform',
    xaxis_title='Number of Responses',
    showlegend=False,
)
fig.show()

In [None]:
# Share of responses per spending bracket (Q26), drawn as a polar bar chart.
spend_labels = {
    '$0 ($USD)': "Never",
    '$100-$999': "USD 100 - 999",
    '$1000-$9,999': "USD 1000 - 9999",
    '$1-$99': "USD 1-99",
    '$10,000-$99,999': "USD 10000 - 99999",
    '$100,000 or more ($USD)': "More than USD 100000"
}

q26_counts = survey_df.iloc[1:, 1:]["Q26"].value_counts()
recommeded_pl_df = pd.DataFrame(q26_counts).reset_index()
recommeded_pl_df.columns = ["Money Spent", "Number of Responses"]
# Replace the raw "$" bracket labels with the "USD ..." wording above, as plain strings.
recommeded_pl_df["Money Spent"] = recommeded_pl_df["Money Spent"].replace(spend_labels)
recommeded_pl_df["Money Spent"] = recommeded_pl_df["Money Spent"].astype(str)

spend_total = sum(recommeded_pl_df["Number of Responses"])
recommeded_pl_df["% of Responses"] = (
    recommeded_pl_df["Number of Responses"] / spend_total
) * 100
recommeded_pl_df = recommeded_pl_df.sort_values("Number of Responses", ascending=False)

# The chart title is the Q26 question text (row 0) up to the first "-".
q26_title = survey_df.iloc[0]["Q26"].split("-")[0].strip()

fig = px.bar_polar(
    recommeded_pl_df,
    theta="Money Spent",
    r="% of Responses",
    color="Money Spent",
    color_discrete_sequence=px.colors.sequential.Plasma_r,
)
fig.update_layout(
    title=dict(text=q26_title, font_size=12),
    title_x=0.5,
    showlegend=False,
)
fig.show()

# Specialized Hardware

In [None]:
# Heatmaps of specialized hardware (Q12) broken down by profession (Q5):
# first the percentage view, then the raw number of responses.
get_relation_and_numresp_between_question(
    single_col_qnum = 5, 
    multiple_col_qnum = 12,
    multiple_col_q_division = None,
    index_name = "Profession", 
    x_title = "Specialized Hardware", 
    y_title = "Profession", 
    title1 = "Relation between Specialized Hardware and Profession", 
    title2 = "Number of Responses for Specialized Hardware by profession",
    colorscale="YlGnBu"
)

In [None]:
# Responses per TPU usage answer (Q13), most common first.
q13_counts = survey_df.iloc[1:, 1:]["Q13"].value_counts()
recommeded_pl_df = pd.DataFrame(q13_counts).reset_index()
recommeded_pl_df.columns = ["TPU Usage", "Number of Responses"]
recommeded_pl_df = recommeded_pl_df.sort_values("Number of Responses", ascending=False)

# The chart title is the Q13 question text (row 0) up to the first "-".
q13_title = survey_df.iloc[0]["Q13"].split("-")[0].strip()

fig = px.bar(
    recommeded_pl_df,
    x="Number of Responses",
    y="TPU Usage",
    color='TPU Usage',
    orientation='h',
)
fig.update_layout(
    title=dict(text=q13_title, font_size=17),
    title_x=0.5,
    yaxis_title='TPU Usage',
    xaxis_title='Number of Responses',
    showlegend=False,
)
fig.show()

# Clustering

#### Let's try clustering the data points to find some relations between different features from the survey data.

In [None]:
from sklearn.preprocessing import LabelEncoder
from category_encoders import TargetEncoder
from sklearn.manifold import TSNE


# Separate LabelEncoder instances; judging by their names, presumably one each for
# the country, profession, gender and education-background columns.
country_enc = LabelEncoder()
prof_enc = LabelEncoder()
gen_enc = LabelEncoder()
edubg_enc = LabelEncoder()

In this survey data, all the inputs are categorical, so we will find no useful insights from a correlation plot or pairs plot; it would simply look like color-coded points on grid intersections. Out of all the features, only a few seem good initially for searching for patterns with respect to the compensation received by an individual.

In [None]:
def find_mean(x):
    """Return the arithmetic mean of the numeric strings in ``x``.

    Parameters
    ----------
    x : iterable
        Values convertible with ``float`` (e.g. the list returned by
        ``re.findall``).

    Returns
    -------
    float or int
        The mean of the values, or ``0`` when ``x`` is empty, is not iterable,
        or contains a value that cannot be converted to ``float``.
    """
    try:
        values = [float(i) for i in x]
    except (TypeError, ValueError):
        # Not iterable or not numeric: keep the "fall back to 0" behaviour, but
        # only for these expected failures instead of swallowing every exception.
        return 0
    if not values:
        # No numbers were found (avoids dividing by zero).
        return 0
    return sum(values) / len(values)

# Build the numeric feature frame used for t-SNE / K-Means.
interested_cols = ['Q1','Q2','Q3','Q4','Q5','Q6','Q15','Q25']
# Explicit copy: the columns below are overwritten and must not be written onto
# a slice of survey_df.
df = survey_df.iloc[1:,1:][interested_cols].copy()
df.columns = ['age','gender','country','edubg','profression','yrofexpc','yrofexpml','compensation']

# Each categorical column gets its own encoder. Fitting one encoder on two
# columns overwrites its first fitted mapping and leaves gen_enc / edubg_enc unused.
df['country'] = country_enc.fit_transform(df.country)
df['gender'] = gen_enc.fit_transform(df.gender)
df['edubg'] = edubg_enc.fit_transform(df.edubg)
df['profression'] = prof_enc.fit_transform(df.profression)

# Range-like answers become the mean of the numbers they contain (0 when there
# are none). Thousands separators are stripped from compensation first.
df['compensation'] = df['compensation'].fillna(0).apply(
    lambda x: re.findall(r"[0-9,]+",str(x).replace(',',""))
).apply(find_mean)
df['age'] = df['age'].fillna(0).apply(
    lambda x: re.findall(r"[0-9]+",str(x))
).apply(find_mean)
df['yrofexpc'] = df['yrofexpc'].fillna("0").apply(
    lambda x: re.findall(r"[0-9]+",x)
).apply(find_mean)
df['yrofexpml'] = df['yrofexpml'].fillna("0").apply(
    lambda x: re.findall(r"[0-9]+",x)
).apply(find_mean)

df.head()

In [None]:
# Project the feature frame to 2-D with t-SNE and report the wall-clock time.
# `time` is already imported in the notebook's import cell.
time_start = time.time()
# random_state makes the embedding (and the clustering built on it) reproducible.
tsne = TSNE(n_components=2, verbose=1, perplexity=45, n_iter=1000, random_state=1)
tsne_results = tsne.fit_transform(df)
print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))

In [None]:
# Scatter plot of the 2-D t-SNE embedding.
# No `hue` is used, so `palette`, `legend` and `data` have nothing to act on
# and are left out.
fig, ax = plt.subplots(figsize=(16,10))
sns.scatterplot(
    x=tsne_results[:,0], y=tsne_results[:,1],
    alpha=0.3,
    ax=ax
)
ax.set_title("t-SNE embedding of survey respondents")
ax.set_xlabel("t-SNE dimension 1")
ax.set_ylabel("t-SNE dimension 2")
plt.show()

In [None]:
from sklearn.cluster import KMeans

def no_of_cluster(df, max_clusters=9, random_state=1):
    """Plot the K-Means elbow curve (inertia against number of clusters).

    Parameters
    ----------
    df : array-like
        Data to cluster, passed straight to ``KMeans.fit``.
    max_clusters : int, default 9
        Largest number of clusters tried; ``1..max_clusters`` are fitted.
    random_state : int, default 1
        Seed for K-Means so that the curve is reproducible.
    """
    cluster_counts = list(range(1, max_clusters + 1))
    inertias = [
        KMeans(n_clusters=k, random_state=random_state).fit(df).inertia_
        for k in cluster_counts
    ]
    plt.plot(cluster_counts, inertias)
    plt.xlabel("Number of clusters")
    plt.ylabel("Inertia")
    plt.title("K-Means elbow curve")
# Inertia curve on the 2-D t-SNE embedding, used to pick the number of clusters.
no_of_cluster(tsne_results)

In [None]:
# K-Means with 3 clusters on the t-SNE embedding; random_state fixes the seed.
model = KMeans(n_clusters=3, random_state=1)
model.fit(tsne_results)

In [None]:
# t-SNE embedding coloured by the K-Means cluster label of each point.
cluster_fig, cluster_ax = plt.subplots(figsize=(20, 7.5))
cluster_ax.scatter(
    tsne_results[:, 0],
    tsne_results[:, 1],
    c=model.labels_,
)

#### I have experimented with clustering and dimensionality reduction approaches in the notebook "[Dimensionality Reduction + Clustering - KMLDSS](https://www.kaggle.com/kayvanshah/dimensionality-reduction-clustering-kmldss)"

# Data

In [None]:
# Peek at the raw data: row 0 holds the question text, the following rows are responses.
survey_df.head(6)

# To be continued...

# THANK YOU