In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Introduction

![](https://3nions.com/wp-content/uploads/2020/01/comp_1.gif)

Social media, and Twitter in particular, can be a powerful tool for a successful political campaign. In this high-level overview of the dataset, we will explore variables such as account age and whether or not each politician has an Instagram account. We will also graph our findings to help make sense of them.

# Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from wordcloud import WordCloud, STOPWORDS

# Import dataset

In [None]:
# Location of the dataset CSV inside the Kaggle input directory.
dataset_path = '/kaggle/input/us-politicians-twitter-dataset/dataset.csv'
df = pd.read_csv(dataset_path)

# High level overview

In [None]:
# Report the frame's dimensions in a single sentence.
row_count, column_count = df.shape
print(f'Our dataset has {row_count} rows and {column_count} columns')

In [None]:
# Show pandas' built-in structural summary of the frame (DataFrame.info).
df.info()

In [None]:
# Fraction of missing values per column, largest first.
null_counts = df.isna().sum()
null_counts.sort_values(ascending=False).div(len(df))

We notice that the Instagram_username column has a large share of NaN values (above 50%). We will have to decide what to do with them later in the notebook.

# Remove unnecessary columns

In [None]:
# The account identifier is not used anywhere in the analysis below.
df = df.drop('Account_ID', axis='columns')

# Feature engineering

Let's perform some feature engineering before we dive into our analysis. I will be creating new columns for:
1. Account_age (twitter account)
2. Instagram_account (boolean—whether they have an account or not)

First, let's convert the account start time column to a datetime dtype that we can work with:

In [None]:
# Parse the account creation timestamps, then express account age in whole
# calendar years relative to the fixed reference year used by this analysis.
reference_year = 2020
account_start = pd.to_datetime(df['Account_start_time'])
df['Account_start_time'] = account_start
df['Account_age'] = reference_year - account_start.dt.year

In [None]:
# True where an Instagram username is present, False where it is missing.
df['Instagram_account'] = df['Instagram_username'].notna()

# Univariate analysis

### Sex

In [None]:
# Count politicians per sex and draw one horizontal bar per category.
sex_counts = (
    df['Sex']
    .value_counts()
    .rename_axis('sex')
    .reset_index(name='count')
)
fig = px.bar(sex_counts, x='count', y='sex', color='sex', title='Politician sex')
fig.update_layout(showlegend=False)
fig.show()

In [None]:
# Relative frequency of each sex category.
sex_share = df['Sex'].value_counts(normalize=True)
print(sex_share)

We can see that 70% of the politicians in this dataset are male, 29% are female, and under 1% are transgender (female or male).

### Birthplace

In [None]:
# Plot the USA birthplace count and the non-USA birthplace counts in two
# side-by-side panels that share one colour axis.
usa_label = 'United States of America'

all_counts = df['Birthplace'].value_counts().reset_index()
all_counts.columns = ['birthplace', 'count']
usa_counts = all_counts[all_counts['birthplace'] == usa_label]

born_elsewhere = df.loc[df['Birthplace'] != usa_label, 'Birthplace']
elsewhere_counts = born_elsewhere.value_counts().reset_index()
elsewhere_counts.columns = ['birthplace', 'count']

panel_titles = ('USA Birthplace count', 'Birthplace count (excluding USA)')
fig = make_subplots(rows=1, cols=2, subplot_titles=panel_titles)

for panel_col, counts in enumerate((usa_counts, elsewhere_counts), start=1):
    bar = go.Bar(
        x=counts['birthplace'],
        y=counts['count'],
        marker=dict(color=counts['count'], coloraxis="coloraxis"),
    )
    fig.add_trace(bar, row=1, col=panel_col)

fig.update_layout(coloraxis=dict(colorscale='fall'), showlegend=False)
fig.show()

In [None]:
# Share of rows whose birthplace is the USA.
# The value is looked up by label rather than by position: positional `[0]` on
# a string-indexed Series is deprecated in recent pandas, and it silently
# reports the wrong category if the USA is ever not the most frequent value.
# The percent format spec also avoids float artifacts from `round(...) * 100`.
usa_share = df['Birthplace'].value_counts(normalize=True).get('United States of America', 0.0)
print(f'The United States birthplace makes up {usa_share:.2%} of this dataset')

### Political party association

In [None]:
# Plot the two major parties and all remaining parties in separate panels
# that share one colour axis.
major_parties = ['Democratic Party', 'Republican Party']
is_major = df['Political_party'].isin(major_parties)

major_counts = df.loc[is_major, 'Political_party'].value_counts().reset_index()
major_counts.columns = ['party', 'count']

minor_counts = df.loc[~is_major, 'Political_party'].value_counts().reset_index()
minor_counts.columns = ['party', 'count']

panel_titles = ('Democrat and Republican count', 'Other parties count (excluding Dem. and Rep.)')
fig = make_subplots(rows=1, cols=2, subplot_titles=panel_titles)

for panel_col, counts in enumerate((major_counts, minor_counts), start=1):
    bar = go.Bar(
        x=counts['party'],
        y=counts['count'],
        marker=dict(color=counts['count'], coloraxis="coloraxis"),
    )
    fig.add_trace(bar, row=1, col=panel_col)

fig.update_layout(coloraxis=dict(colorscale='fall'), showlegend=False)
fig.show()

Since the majority of our dataset falls under 'Democrat' or 'Republican', we can group all other parties as 'other' if we feel it will help our analysis and understanding of the data. For brevity, I'm going to reduce the political parties to three labels: Democrat, Republican, and other.

In [None]:
# Collapse the party labels to 'Democrat' / 'Republican' / 'other'.
# The short labels map to themselves so that re-running this cell is a no-op;
# matching only the long names would turn every row into 'other' on a second
# run, because the column is overwritten in place.
party_labels = {
    'Democratic Party': 'Democrat',
    'Republican Party': 'Republican',
    'Democrat': 'Democrat',
    'Republican': 'Republican',
}

# Unmapped labels (and missing values) become NaN after .map, then 'other'.
df['Political_party'] = df['Political_party'].map(party_labels).fillna('other')

### Age distribution

In [None]:
# Distribution of politician age.
px.histogram(df, x='Age')

In [None]:
# Mean age rounded to the nearest whole year.
mean_age = round(df['Age'].mean())
print(f'The mean age of a politician in this dataset is {mean_age} years old ')

### Instagram account

In [None]:
# Share of politicians with and without an Instagram account.
px.pie(df, names='Instagram_account', title='Instagram account?', hole=.3)

# Multivariate analysis

Let's mainly explore these categorical variables:

1. Sex
2. Political party
3. Instagram_account 

In [None]:
# Age distribution, coloured by political party.
px.histogram(data_frame=df, x='Age', color='Political_party')

In [None]:
# Median of the numeric columns per (party, sex) group.
# numeric_only=True is needed on pandas >= 2.0, where aggregating the frame's
# text columns raises a TypeError instead of silently dropping them.
# `e` is not consumed by the box plot below, which is drawn from the raw rows.
e = df.groupby(['Political_party', 'Sex']).median(numeric_only=True).reset_index()
px.box(df, x='Political_party', y='Age', color='Sex', points="all")

In [None]:
# Mean of the numeric columns per (party, sex) group; the sunburst sizes each
# sector by the group's mean age.
# numeric_only=True is needed on pandas >= 2.0, where averaging the frame's
# text columns raises a TypeError instead of silently dropping them.
e2 = df.groupby(['Political_party', 'Sex']).mean(numeric_only=True).reset_index()
px.sunburst(e2, path=['Political_party', 'Sex'], values='Age', title='Sunburst plot for politician age')

In [None]:
# Politician age against Twitter account age, one OLS trendline per party,
# with a histogram of account age along the y margin.
px.scatter(
    df,
    x='Age',
    y='Account_age',
    color='Political_party',
    title='Age vs account age',
    trendline='ols',
    marginal_y='histogram',
)

There does not seem to be any strong correlation between a politician's age and the age of their Twitter account.

In [None]:
# Mean of the numeric columns per (party, has-Instagram) group; the bar chart
# uses the mean age.
# numeric_only=True is needed on pandas >= 2.0, where averaging the frame's
# text columns raises a TypeError instead of silently dropping them.
f = df.groupby(['Political_party', 'Instagram_account']).mean(numeric_only=True).reset_index()
px.bar(f, y='Political_party', x='Age', color='Instagram_account', barmode='group')

In [None]:
# Mean of the numeric columns per (sex, has-Instagram) group; the bar chart
# uses the mean age.
# numeric_only=True is needed on pandas >= 2.0, where averaging the frame's
# text columns raises a TypeError instead of silently dropping them.
f2 = df.groupby(['Sex', 'Instagram_account']).mean(numeric_only=True).reset_index()
px.bar(f2, y='Sex', x='Age', color='Instagram_account', barmode='group')

This is consistent with the notion that younger people gravitate towards Instagram: the average age of politicians who have an Instagram account is lower than that of those who do not. We also notice that females have the lowest average age among those with an Instagram account.

In [None]:
# Parallel-categories view restricted to the four columns of interest.
category_columns = ['Sex', 'Age', 'Political_party', 'Instagram_account']
px.parallel_categories(df[category_columns], title='Parallel categories chart')

# Feature engineering part II

1. Does Twitter handle match Instagram handle?
2. First name counts

In [None]:
# Compare the two handles case-insensitively.
# Both sides are lower-cased (previously only the Twitter side was, so an
# Instagram handle stored with capitals could never match), and the .str
# accessor passes missing handles through as NaN instead of raising.
twitter_handle = df['Twitter_username'].str.lower()
instagram_handle = df['Instagram_username'].str.lower()

# The Twitter column is still stored lower-cased, as before.
df['Twitter_username'] = twitter_handle
# A missing handle on either side compares as False.
df['match'] = twitter_handle.eq(instagram_handle)

In [None]:
# First whitespace-separated token of the name. The vectorised .str accessor
# leaves a missing name as NaN instead of raising AttributeError as the
# per-row lambda did.
df['fname'] = df['Name'].str.split(' ').str[0]

In [None]:
# Build the 15 most frequent first names for each sex, then plot them in two
# stacked panels (male on top, female below) that share one colour axis.
top_n = 15
name_tables = []
for sex_label in ('male', 'female'):
    first_names = df.loc[df['Sex'] == sex_label, 'fname']
    top_names = first_names.value_counts().head(top_n).reset_index()
    top_names.columns = ['fname', 'count']
    name_tables.append(top_names)

panel_titles = ('Top 15 male name occurence', 'Top 15 female name occurence')
fig = make_subplots(rows=2, cols=1, subplot_titles=panel_titles)

for panel_row, top_names in enumerate(name_tables, start=1):
    bar = go.Bar(
        x=top_names['fname'],
        y=top_names['count'],
        marker=dict(color=top_names['count'], coloraxis="coloraxis"),
    )
    fig.add_trace(bar, row=panel_row, col=1)

fig.update_layout(coloraxis=dict(colorscale='fall'), showlegend=False)
fig.show()

In [None]:
# Share of politicians whose Twitter and Instagram handles are identical.
px.pie(df, names='match', hole=.3, title='Does Twitter handle match Instagram handle?')

# Conclusion

Further analysis can be performed; however, I think we have exhausted this dataset. It would be interesting to add statistics such as followers, following, and likes to the dataset to get a better understanding of how these variables relate to things like the age of the politician or of the account. If you found this at all helpful or useful, I'd appreciate a 👍