In [1]:
# Basic import
import warnings
warnings.filterwarnings('ignore')

from country_code_file import country_codes
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
%matplotlib inline 

# Overview
> The goal of this project is to provide a detailed EDA representation of the total number of users per attributes specified. With this project I will break this into sections to ananlyse as much of the dataset.

- Workflow:
    - `Basic Discovery`
    - `Filtering`
    - `Categorical features` EDA
    - `Numerical features` EDA

- toc: true
- badges: true
- comments: true
- categories: [EDA, Pandas]
- image: https://duckduckgo.com/?q=data+science+image&atb=v314-1&iax=images&ia=images&iai=https%3A%2F%2Fnews.southernct.edu%2Fwp-content%2Fuploads%2F2019%2F12%2FDataScience_home.jpg

In [2]:
df = pd.read_csv('./Users_Descriptives.csv')
df.columns = df.columns.str.replace("[^\w\s]", "").str.replace(" ", "_").str.lower()
df.head()

Unnamed: 0,id,city,state,country,gender,profile_length,activitymonthlyduration,highagreement,lowagreement,views,fame,questions,answers,comments,edits,tokens
0,38940,Bydgoszcz,Kujawsko-Pomorskie,Poland,male,44.0,12.53,387.0,68.0,2641.0,24175.0,302.0,52.0,479.0,1046.0,422.0
1,168107,Marysville,Ohio,United States,male,4.0,9.8,77.0,3.0,153.0,1089.0,1.0,32.0,84.0,40.0,23.0
2,168124,Chaville,Île-de-France,France,male,3.0,10.53,55.0,2.0,52.0,743.0,12.0,6.0,10.0,44.0,23.0
3,13,Raleigh,North Carolina,United States,male,179.0,12.79,5212.0,210.0,42264.0,206112.0,10.0,2134.0,3870.0,3842.0,832.0
4,168141,Denver,Colorado,United States,male,1.0,1.72,0.0,0.0,19.0,123.0,4.0,,2.0,12.0,7.0


## Basic Discovery
- In this part of the notebook will just go through getting to know what data we are dealing with before doing in sort of cleaning and plotting.

In [3]:
df.shape

(27030, 16)

**Conclusion**
- We have `3,747,583` rows of data. Since this has a huge quantity let's check out how much `memory` this data is using.
- We see that the machine uses `1,24GB` of memory.

In [4]:
(df
 .memory_usage(deep=True)
 .sum()
)

9134560

In [5]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,27030.0,191818.114539,138829.807954,1.0,53481.25,179061.0,297972.0,432130.0
profile_length,25576.0,14.467,30.570125,1.0,1.0,3.0,15.0,1111.0
activitymonthlyduration,27029.0,9.411596,3.691719,0.0,8.36,10.98,11.84,12.84
highagreement,27029.0,340.932406,1056.045476,0.0,8.0,60.0,272.0,60260.0
lowagreement,27029.0,40.218876,802.867348,0.0,0.0,1.0,8.0,94846.0
views,27029.0,595.706981,5068.626517,0.0,33.0,109.0,341.0,585755.0
fame,27029.0,5932.13689,23795.705914,1.0,169.0,925.0,3728.0,1042979.0
questions,22593.0,22.04643,54.278724,1.0,2.0,7.0,21.0,2539.0
answers,24374.0,78.13338,394.07259,1.0,4.0,14.0,47.0,25827.0
comments,23574.0,157.224782,992.12903,1.0,7.0,23.0,81.0,50468.0


In [6]:
df.dtypes

id                           int64
city                        object
state                       object
country                     object
gender                      object
profile_length             float64
activitymonthlyduration    float64
highagreement              float64
lowagreement               float64
views                      float64
fame                       float64
questions                  float64
answers                    float64
comments                   float64
edits                      float64
tokens                     float64
dtype: object

In [7]:
# Function
def missing_data(data):
    """
    This function calculates missing data for each column in a pandas dataframe.
    It returns a pandas dataframe containing the column name, the number of missing values,
    the percentage of missing values, the data type of the column, and the set of unique values
    for that column.
    
    :param data: A pandas dataframe
    :return: A pandas dataframe
    """
    total = data.isnull().sum()
    percent = round(data.isnull().sum()/data.isnull().count()*100,2)
    tt = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    types = []
    for col in data.columns:
        dtype = str(data[col].dtype)
        types.append(dtype)
    tt['Types'] = types
    return(np.transpose(tt))

def unique_values(data):
    """
    This function takes a pandas dataframe as input and returns a transposed
    pandas dataframe that lists the total number of observations and the number
    of unique values for each column.
    
    Parameters:
    -----------
    data: A pandas dataframe.
    
    Returns:
    --------
    A transposed pandas dataframe that lists the total number of observations
    and the number of unique values for each column.
    """
    total = data.count()
    tt = pd.DataFrame(total)
    tt.columns = ['Total']
    uniques = []
    for col in data.columns:
        unique = data[col].nunique()
        uniques.append(unique)
    tt['Uniques'] = uniques
    return(np.transpose(tt))

In [8]:
obj = df.select_dtypes(include="object")
missing_data(obj)

Unnamed: 0,city,state,country,gender
Total,8227,7134,2,1
Percent,30.44,26.39,0.01,0.0
Types,object,object,object,object


In [9]:
num = df.select_dtypes(include="number")
nm = missing_data(num)
nm.transpose()

Unnamed: 0,Total,Percent,Types
id,0,0.0,int64
profile_length,1454,5.38,float64
activitymonthlyduration,1,0.0,float64
highagreement,1,0.0,float64
lowagreement,1,0.0,float64
views,1,0.0,float64
fame,1,0.0,float64
questions,4437,16.42,float64
answers,2656,9.83,float64
comments,3456,12.79,float64


**Conclusion**
- The `city` column contains has the highest number of missing values with 31%
- But since all the categorical columns have low percentages we can just drop them

In [10]:
nan_df = df[df.isna().any(axis=1)]
nan_df

Unnamed: 0,id,city,state,country,gender,profile_length,activitymonthlyduration,highagreement,lowagreement,views,fame,questions,answers,comments,edits,tokens
4,168141,Denver,Colorado,United States,male,1.0,1.72,0.0,0.0,19.0,123.0,4.0,,2.0,12.0,7.0
6,227169,,,Viet Nam,male,1.0,6.01,0.0,0.0,16.0,75.0,3.0,,2.0,10.0,8.0
7,103814,,Arizona,United States,male,1.0,10.85,136.0,1.0,153.0,2212.0,18.0,17.0,57.0,90.0,65.0
8,168177,,,Portugal,male,3.0,11.75,1218.0,72.0,1508.0,9912.0,95.0,219.0,848.0,638.0,170.0
9,30,,,United States,male,8.0,12.83,146.0,8.0,1522.0,11427.0,11.0,65.0,65.0,153.0,99.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27020,63928,Minneapolis,Minnesota,United States,female,9.0,0.10,0.0,0.0,35.0,31.0,,,,,2.0
27023,63985,Dublin,Leinster,Ireland,male,3.0,12.68,82.0,8.0,136.0,1273.0,,29.0,8.0,29.0,18.0
27025,128654,,,Norway,male,6.0,11.93,104.0,2.0,149.0,1772.0,9.0,7.0,31.0,50.0,37.0
27026,128661,,,Germany,male,8.0,11.49,241.0,7.0,507.0,3827.0,2.0,47.0,79.0,102.0,59.0


In [11]:
def clean(data):
    # Categorical
    cols = ['city', 'state', 'country', 'gender']
    for objs in cols:
        data = data.dropna(subset=cols)
        
    # Numerical
    numerical = data.select_dtypes(include="number")
    for num in numerical.columns:
        data[num].fillna(data[num].median(), inplace=True)
#         data[num].fillna(0, inplace=True)
    
    data["city"] = data['city'].astype('category')
    data["state"] = data['state'].astype('category')
    data["country"] = data['country'].astype('category')
    data["gender"] = data['gender'].astype('category')
    return data

In [12]:
user_d = clean(df)
md = missing_data(user_d)
md.transpose()

Unnamed: 0,Total,Percent,Types
id,0,0.0,int64
city,0,0.0,category
state,0,0.0,category
country,0,0.0,category
gender,0,0.0,category
profile_length,0,0.0,float64
activitymonthlyduration,0,0.0,float64
highagreement,0,0.0,float64
lowagreement,0,0.0,float64
views,0,0.0,float64


## Filtering
**QUESTIONS**
- What is the average and sum of variables by gender.
- What is the average and sum of variables by country.
- Which gender per country as the most views, fame, questions

In [13]:
# Average and sum of variables by gender.
(user_d
 .groupby("gender")
 .agg({"questions": ["mean", "sum"],
       "answers": ["mean", "sum"],
       "comments": ["mean", "sum"],
       "activitymonthlyduration": ["mean", "sum"]
      })
)

Unnamed: 0_level_0,questions,questions,answers,answers,comments,comments,activitymonthlyduration,activitymonthlyduration
Unnamed: 0_level_1,mean,sum,mean,sum,mean,sum,mean,sum
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
female,18.073489,33201.0,58.049537,106637.0,114.124115,209646.0,9.108519,16732.35
male,19.156781,312313.0,72.173894,1176651.0,137.562964,2242689.0,9.634311,157068.17


**Conclusion**
- Clearly males are the ones with the highest average and sum
- Let's see which country reflects this

In [14]:
# Average and sum of variables by country.
user_d = user_d.replace({'United States':'US', 'United Kingdom':'UK', 'South Africa':'SA'})
user_country_code = user_d[user_d["country"].isin(["US", "UK", "India", "Germany", "SA",
                                                   "Canada", "Australia", "France"])]
(user_country_code
 .groupby("country")
 .agg({"questions": ["mean", "sum"],
       "answers": ["mean", "sum"],
       "views": ["mean", "sum"],
       "fame": ["mean", "sum"]
      })
)

Unnamed: 0_level_0,questions,questions,answers,answers,views,views,fame,fame
Unnamed: 0_level_1,mean,sum,mean,sum,mean,sum,mean,sum
country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Australia,19.497976,14448.0,78.476383,58151.0,521.676113,386562.0,5563.132254,4122281.0
Canada,19.007619,19958.0,59.100952,62056.0,430.753333,452291.0,5443.321905,5715488.0
France,17.044728,5335.0,72.610224,22727.0,471.498403,147579.0,5257.386581,1645562.0
Germany,20.676152,15259.0,115.822493,85477.0,830.651762,613021.0,9568.598916,7061626.0
India,22.0,20460.0,39.550538,36782.0,322.197849,299644.0,3317.529032,3085302.0
SA,15.066667,1356.0,33.677778,3031.0,319.933333,28794.0,2660.055556,239405.0
UK,22.278204,37717.0,105.061429,177869.0,704.119905,1192075.0,7853.476669,13295936.0
US,18.188064,147196.0,68.508093,554436.0,672.393674,5441682.0,5980.821945,48402792.0


**Conclusion**
- The `UK` reflects the highest average and sum
- `Germany` follows in second

In [15]:
# Gender per country with views
user_group = user_d[user_d["country"].isin(["US", "UK", "India", "Germany", "SA",
                                           "Canada", "Australia", "France"])]
(user_group
   .groupby(['gender','country'])
   .views
   .first()
   .unstack()
)

country,Australia,Canada,France,Germany,India,SA,UK,US
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
female,122.0,180.0,34.0,114.0,9.0,1124.0,59.0,1141.0
male,293.0,267.0,52.0,93.0,162.0,227.0,897.0,153.0


In [16]:
# gender per country with fame
user_d = user_d.replace({'United States':'US', 'United Kingdom':'UK', 'South Africa':'SA'})
user_group = user_d[user_d["country"].isin(["US", "UK", "India", "Germany", "SA",
                                           "Canada", "Australia", "France"])]
(user_group
   .groupby(['gender','country'])
   .fame
   .first()
   .unstack()
)

country,Australia,Canada,France,Germany,India,SA,UK,US
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
female,1255.0,6028.0,271.0,1314.0,13.0,12127.0,29.0,3189.0
male,4439.0,1450.0,743.0,738.0,621.0,2135.0,11213.0,1089.0


In [17]:
# gender per country with questions
user_d = user_d.replace({'United States':'US', 'United Kingdom':'UK', 'South Africa':'SA'})
user_group = user_d[user_d["country"].isin(["US", "UK", "India", "Germany", "SA",
                                           "Canada", "Australia", "France"])]
(user_group
   .groupby(['gender','country'])
   .questions
   .first()
   .unstack()
)

country,Australia,Canada,France,Germany,India,SA,UK,US
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
female,6.0,3.0,7.0,29.0,2.0,105.0,6.0,17.0
male,26.0,17.0,12.0,5.0,6.0,7.0,133.0,1.0


## EDA  Analysis

### Categorical
**QUESTIONS**
- What is the top countries with high fame by gender?
- What is the top cities with high questions?
- What is the total fame by country?
- `World Map`: Total questions, answers, edits, monthly duration by country

In [18]:
pie_gender_views = (user_d
 .groupby('gender', as_index=False)
 .sum()
 .sort_values(by='views', ascending=False)
)
def pie_chart(data, col, val):
    fig = px.pie(data, values=val, names=col,
            color_discrete_sequence=['#14213d', '#9d0208'],
            title=f'Total {val} by {col}',
            hole=.6)
    fig.update_traces(textposition='inside', textinfo='percent+label')
    fig.layout.template = 'plotly_dark'
    return fig.show()
pie_chart(pie_gender_views, 'gender', 'views')

In [19]:
# Histogram Gender
gender_hist = user_d.replace('male', 'Males').replace('female', 'Females')
for gen in gender_hist['gender'].unique():
    dataframe = gender_hist.loc[gender_hist['gender'] == gen].groupby('country')['fame'].agg('sum').sort_values(ascending=False)[:10]
    dataframe = pd.DataFrame({'Country':dataframe.index, 'Fame':dataframe.values})
    fig = px.histogram(dataframe, x="Fame", y='Country', color='Country',
                       title=f'Top 10 countries with fame by {gen}')
    fig.layout.template = 'plotly_dark'
    fig.update_traces(opacity=0.7)
    fig.show()

**Coclusion**
- For `males`: US as expected came first but interestingly we have `UK second`. We can get a closer look at which city in the UK carry most of fame.
- For `females`: US again first, but `India came in second` we is the opposite of the males. We will also look at which city in India carry the most fame.

In [20]:
india_female_city = user_d[(user_d['gender']=='female') & (user_d['country']=='India')]
(india_female_city['city']
 .value_counts()
 .reset_index()
 .rename(columns={'index':'city', 'city':'count'})[:7]
)

Unnamed: 0,city,count
0,Bengaluru,34
1,Chennai,19
2,Mumbai,15
3,Hyderabad,11
4,Pune,9
5,Ahmedabad,7
6,Kochi,3


In [21]:
uk_male_city = user_d[(user_d['gender']=='male') & (user_d['country']=='UK')]
(uk_male_city['city']
 .value_counts()
 .reset_index()
 .rename(columns={'index':'city', 'city':'count'})[:7]
)

Unnamed: 0,city,count
0,London,734
1,Cambridge,57
2,Manchester,47
3,Edinburgh,45
4,Bristol,42
5,Glasgow,35
6,Oxford,32


In [22]:
# Histogram Countries
dataframe = user_d.groupby('state').agg('sum').sort_values(by='views', ascending=False)[:10]
fig = px.histogram(dataframe, x="views", y=dataframe.index, color=dataframe.index,
                   title=f'Top 10 states with high views')
fig.layout.template = 'plotly_dark'
fig.update_traces(opacity=0.7)
fig.show()

In [23]:
# Histogram Cities
unq_variant = user_d[user_d["country"].isin(["US", "UK", "India", "Germany"])]
dataframe = unq_variant.groupby('city').agg('sum').sort_values(by='questions', ascending=False)[:10]
fig = px.histogram(dataframe, x="questions", y=dataframe.index, color=dataframe.index,
                   title=f'Top 10 cities with high questions',
                   labels={'sum of questions':'questions'})
fig.layout.template = 'plotly_dark'
fig.update_traces(opacity=0.7)
fig.show()

In [24]:
def scatter_world_map(data, category, size):
    user_category = data.groupby(category, as_index=False).sum()
    user_category['Code'] = user_category[category].map(country_codes)
    
    # Plot
    map_fig = px.scatter_geo(user_category,
                     locations = 'Code',
                     projection = 'orthographic',
                     title=f'Total {size} by country',
                     color = category,
                     opacity=.7,
                     size=size,
                     hover_name=category,
                     hover_data=['views', 'questions', 'activitymonthlyduration', 'fame']
    )
    map_fig.layout.template = 'plotly_dark'
    return map_fig.show()

In [25]:
scatter_world_map(user_d, 'country', 'views')

In [26]:
scatter_world_map(user_d, 'country', 'questions')

In [27]:
scatter_world_map(user_d, 'country', 'fame')

### Numerical
**QUESTIONS**
- Which city and gender have the highest views?
- Which country has the lowest fame?

In [28]:
def correlation(data):
    corrmat = data.corr()
    fig = px.imshow(corrmat, color_continuous_scale=["#caf0f8","#03071e", "#370617", "#70e000"],
                    title='Heatmap User Description')
    fig.layout.template = 'plotly_dark'
    return fig.show()

correlation(user_d)

#### Views

In [29]:
def numerical_histogram(data, x, color):
    fig = px.histogram(data, x=x,
                      title=f'Histogram of {x}',
                      color_discrete_sequence=[color])
    fig.layout.template = 'plotly_dark'
    return fig.show()

In [30]:
view_zero = user_d[user_d['views']<150]
numerical_histogram(view_zero, 'views', 'rosybrown')

**Conclusion**
- Views is right skewed
- There is a lot of data and `outliers in views`, which makes our data `skew`. So we will lower the amount of views in our data because it is clear that less people have more views.

In [31]:
one_million = user_d[user_d['views']>100000]
(one_million['state']
 .value_counts()
 .reset_index()[:10]
 .rename(columns={'index':'state', 'state':'count'})
)

Unnamed: 0,state,count
0,North Carolina,1
1,California,1
2,South Denmark,1
3,New York,1
4,Canterbury,1
5,SG.01,1
6,Primorsko-Goranska,0
7,Plzeňský kraj,0
8,Pomerania,0
9,Ponce,0


In [32]:
user_d[user_d['views']>500000]

Unnamed: 0,id,city,state,country,gender,profile_length,activitymonthlyduration,highagreement,lowagreement,views,fame,questions,answers,comments,edits,tokens
489,1,El Cerrito,California,US,male,64.0,12.81,3390.0,1311.0,585755.0,60897.0,16.0,127.0,382.0,5066.0,343.0


In [33]:
user_d[user_d['views']<5][:10]

Unnamed: 0,id,city,state,country,gender,profile_length,activitymonthlyduration,highagreement,lowagreement,views,fame,questions,answers,comments,edits,tokens
81,351694,Chennai,Tamil Nadu,India,male,1.0,0.0,0.0,0.0,2.0,3.0,2.0,14.0,1.0,6.0,2.0
88,227707,Hong Kong,Central and Western,Hong Kong,male,1.0,2.24,0.0,0.0,2.0,1.0,7.0,14.0,24.0,43.0,1.0
140,287518,Paris,Île-de-France,France,male,1.0,8.65,1.0,0.0,0.0,26.0,7.0,1.0,24.0,1.0,2.0
204,412145,Montreal,Quebec,Canada,male,1.0,1.97,0.0,0.0,0.0,1.0,7.0,14.0,24.0,43.0,27.0
241,352549,Dalian,Liaoning,China,male,3.0,0.06,0.0,0.0,4.0,11.0,1.0,14.0,1.0,6.0,3.0
274,228509,Atlanta,Georgia,US,male,4.0,1.63,0.0,0.0,4.0,23.0,1.0,14.0,2.0,3.0,6.0
293,170049,Berlin,Berlin,Germany,male,1.0,0.37,4.0,0.0,3.0,36.0,7.0,2.0,24.0,3.0,3.0
345,170466,Darmstadt,Hesse,Germany,male,1.0,0.29,1.0,0.0,3.0,21.0,7.0,1.0,24.0,2.0,3.0
375,229026,Montreal,Quebec,Canada,male,1.0,0.0,0.0,0.0,2.0,1.0,1.0,14.0,24.0,3.0,27.0
400,170764,Leesburg,Virginia,US,female,4.0,9.08,0.0,0.0,4.0,5.0,2.0,14.0,3.0,6.0,3.0


In [34]:
def categorical_feature(x):
    if x < 35:
        return 'Low Views'
    elif x < 70:
        return 'Some Views'
    elif x < 105:
        return 'Medium Views'
    elif pd.isnull(x):
        return np.nan
    else:
        return 'High Views'

In [35]:
view_zero['view_feature'] = view_zero['views'].map(categorical_feature)
view_zero[['country','city','gender','views', 'view_feature']]

Unnamed: 0,country,city,gender,views,view_feature
2,France,Chaville,male,52.0,Some Views
4,US,Denver,male,19.0,Low Views
13,US,Pittsfield,male,35.0,Some Views
14,New Zealand,Auckland,male,98.0,Medium Views
22,US,Chicago,male,9.0,Low Views
...,...,...,...,...,...
27012,France,Nantes,male,7.0,Low Views
27017,US,Chicago,male,1.0,Low Views
27020,US,Minneapolis,female,35.0,Some Views
27023,Ireland,Dublin,male,136.0,High Views


In [36]:
hist_categorical_views = (view_zero['view_feature']
 .value_counts()
 .reset_index()
 .rename(columns=({'index': 'views', 'view_feature': 'count'}))
)
fig = px.histogram(hist_categorical_views, x='count', y='views',
                      title='Histogram of View Feature',
                      color='views',
                      color_discrete_sequence=['#005f73', '#ee9b00', '#ae2012', '#9b2226'])
fig.layout.template = 'plotly_dark'
fig.show()

In [37]:
view_zero[view_zero['view_feature']=='High Views'].sort_values(by='views', ascending=False)[:10]

Unnamed: 0,id,city,state,country,gender,profile_length,activitymonthlyduration,highagreement,lowagreement,views,fame,questions,answers,comments,edits,tokens,view_feature
4096,351975,Barcelona,Catalonia,Spain,male,45.0,11.01,92.0,1.0,149.0,2113.0,31.0,34.0,80.0,161.0,62.0,High Views
6933,44605,Austin,Texas,US,male,4.0,12.47,85.0,18.0,149.0,1275.0,6.0,49.0,151.0,81.0,35.0,High Views
14476,51059,Reykjavík,Capital Region,Iceland,male,5.0,12.25,10.0,0.0,149.0,725.0,4.0,2.0,15.0,23.0,22.0,High Views
21683,296696,New York,New York,US,male,1.0,9.21,28.0,1.0,149.0,1466.0,1.0,24.0,21.0,56.0,21.0,High Views
4321,105667,Austin,Texas,US,male,90.0,11.95,72.0,0.0,149.0,821.0,4.0,14.0,17.0,43.0,23.0,High Views
4267,40498,Delhi,Delhi,India,male,1.0,10.57,13.0,0.0,149.0,593.0,12.0,2.0,13.0,39.0,25.0,High Views
22155,53155,New York,New York,US,male,26.0,10.18,39.0,0.0,149.0,812.0,7.0,38.0,15.0,52.0,18.0,High Views
21395,4341,Melbourne,Victoria,Australia,female,22.0,12.74,174.0,1.0,149.0,1434.0,6.0,12.0,9.0,35.0,37.0,High Views
11023,290844,Nashville,Tennessee,US,male,1.0,10.52,331.0,0.0,149.0,1532.0,44.0,12.0,87.0,161.0,55.0,High Views
14787,177040,Bengaluru,Karnataka,India,male,64.0,11.69,19.0,0.0,149.0,626.0,43.0,5.0,73.0,143.0,34.0,High Views


In [38]:
view_zero['country'].value_counts().reset_index()[:10]

Unnamed: 0,index,country
0,US,4634
1,UK,881
2,Canada,615
3,India,583
4,Australia,403
5,Germany,354
6,France,185
7,Sweden,175
8,Russian Federation,173
9,Netherlands,140


#### monthly duration

In [39]:
user_d['activitymonthlyduration'].value_counts()

11.45    183
11.44    173
10.99    168
10.78    168
10.79    157
        ... 
3.75       1
3.68       1
4.64       1
4.67       1
3.65       1
Name: activitymonthlyduration, Length: 1271, dtype: int64

In [40]:
amd = user_d[(user_d['activitymonthlyduration']>0.05) & (user_d['activitymonthlyduration']<12.00)]

**Conclusion**
- There is a lot of people with less than a month so we can remove some just to see our data better

In [41]:
(amd['activitymonthlyduration']
 .value_counts()
 .reset_index()[:10]
 .rename(columns={'index':'monthly_duration', 'activitymonthlyduration':'count'})
)

Unnamed: 0,monthly_duration,count
0,11.45,183
1,11.44,173
2,10.78,168
3,10.99,168
4,10.79,157
5,11.21,153
6,11.68,152
7,10.98,150
8,10.8,148
9,10.81,147


In [42]:
numerical_histogram(amd, 'activitymonthlyduration', 'saddlebrown')

In [43]:
def categorical_feature(x):
    if x < 1:
        return 0
    elif x < 2:
        return 1
    elif x < 3:
        return 2
    elif x < 4:
        return 3
    elif x < 5:
        return 4
    elif x < 6:
        return 5
    elif x < 7:
        return 6
    elif x < 8:
        return 7
    elif x < 9:
        return 8
    elif x < 10:
        return 9
    elif x < 11:
        return 10
    elif x < 12:
        return 11
    else:
        return 12
    
amd['monthly_feature'] = amd['activitymonthlyduration'].map(categorical_feature)

In [44]:
amd[['country','city','gender','activitymonthlyduration', 'monthly_feature']]

Unnamed: 0,country,city,gender,activitymonthlyduration,monthly_feature
1,US,Marysville,male,9.80,9
2,France,Chaville,male,10.53,10
4,US,Denver,male,1.72,1
5,US,New York,male,11.74,11
11,Australia,Brisbane,male,10.95,10
...,...,...,...,...,...
27020,US,Minneapolis,female,0.10,0
27021,US,San Francisco,male,10.45,10
27022,France,Saint-Affrique,male,11.93,11
27024,India,Ahmedabad,male,11.93,11


In [45]:
hist_categorical_monthly = (amd['monthly_feature']
 .value_counts()
 .reset_index()
 .rename(columns=({'index': 'monthly', 'monthly_feature': 'count'}))
)
fig = px.histogram(hist_categorical_monthly, x='monthly', y='count',
                   title='Histogram of View Feature',
                   color_discrete_sequence=['saddlebrown'],
                   nbins=35
                  )
fig.layout.template = 'plotly_dark'
fig.show()

#### fame

In [46]:
fame_data = user_d[user_d['fame']>1]

In [47]:
(user_d['fame']
 .value_counts()
 .reset_index()[:10]
 .rename(columns={'index':'fame', 'fame':'count'})
)

Unnamed: 0,fame,count
0,1.0,527
1,11.0,284
2,21.0,233
3,31.0,166
4,41.0,123
5,101.0,116
6,43.0,85
7,51.0,84
8,61.0,82
9,23.0,79


In [48]:
user_fame = user_d[(user_d['fame']>1)& (user_d['fame']<250)]

In [49]:
numerical_histogram(user_fame, 'fame', 'sienna')

In [50]:
def categorical_feature(x):
    if x <= 60:
        return 'Beginner'
    elif x <=120:
        return 'Medium'
    elif x <= 180:
        return 'Intermediate'
    else:
        return 'Famous'

In [51]:
user_fame['fame_feature'] = user_fame['fame'].map(categorical_feature)
user_fame[['country','city','gender','fame', 'fame_feature']]

Unnamed: 0,country,city,gender,fame,fame_feature
4,US,Denver,male,123.0,Intermediate
13,US,Pittsfield,male,53.0,Beginner
22,US,Chicago,male,45.0,Beginner
28,US,Portland,male,45.0,Beginner
29,Canada,Vancouver,male,46.0,Beginner
...,...,...,...,...,...
26978,UK,York,male,171.0,Intermediate
27004,US,Hoboken,male,13.0,Beginner
27010,Denmark,Copenhagen,male,21.0,Beginner
27012,France,Nantes,male,118.0,Medium


In [52]:
user_fame[user_fame['fame_feature']=='Famous']

Unnamed: 0,id,city,state,country,gender,profile_length,activitymonthlyduration,highagreement,lowagreement,views,fame,questions,answers,comments,edits,tokens,fame_feature
57,227471,Arlington,Texas,US,male,1.0,7.70,8.0,0.0,45.0,214.0,8.0,5.0,8.0,29.0,17.0,Famous
223,40108,Bengaluru,Karnataka,India,male,3.0,5.43,0.0,0.0,90.0,208.0,1.0,9.0,6.0,14.0,7.0,Famous
430,815,Seattle,Washington,US,male,1.0,2.66,17.0,0.0,52.0,202.0,5.0,1.0,1.0,19.0,16.0,Famous
437,106173,Oslo,Oslo,Norway,male,1.0,8.35,0.0,0.0,31.0,183.0,2.0,14.0,2.0,6.0,6.0,Famous
450,413515,Al Madinah Al Munawwarah,Medina Region,Saudi Arabia,male,4.0,9.72,41.0,0.0,77.0,249.0,26.0,1.0,31.0,82.0,19.0,Famous
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26508,242815,Waterloo,Ontario,Canada,male,1.0,5.01,2.0,0.0,12.0,216.0,1.0,1.0,2.0,4.0,4.0,Famous
26670,426866,Seattle,Washington,US,male,1.0,10.72,32.0,1.0,21.0,184.0,3.0,8.0,9.0,21.0,10.0,Famous
26738,186125,Vancouver,British Columbia,Canada,male,1.0,7.82,32.0,0.0,27.0,186.0,4.0,14.0,13.0,32.0,9.0,Famous
26876,122953,Tromsø,Troms og Finnmark,Norway,male,1.0,11.96,1.0,0.0,32.0,203.0,2.0,6.0,2.0,12.0,7.0,Famous


In [53]:
hist_categorical_fame = (user_fame['fame_feature']
 .value_counts()
 .reset_index()
 .rename(columns=({'index': 'fame', 'fame_feature': 'count'}))
)
fig = px.histogram(hist_categorical_fame, x='count', y='fame',
                      title='Histogram of Fame Feature',
                      color='fame',
                      color_discrete_sequence=['#007f5f', '#2b9348', '#55a630', '#80b918'])
fig.layout.template = 'plotly_dark'
fig.show()