In [None]:
#!/usr/bin/env python
# coding: utf-8

The analysis below follows the tutorial at https://medium.com/analytics-vidhya/exploratory-data-analysis-for-beginner-7488d587f1ec

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import nltk

In [None]:
# Plot styling: use the "ticks" theme.
# 'dark', 'darkgrid' and 'whitegrid' are some of the other available styles.
sns.set(style="ticks")

# Location of the cleaned tweet export, relative to the working directory.
filename = 'hydrated_clean.csv'
directory = os.path.join('data', filename)  # NOTE: this is the file path, not a directory

# Load every column as text so nothing is reinterpreted while parsing.
# `str` is the spelling the pandas documentation uses for this; the previous
# 'unicode' string was a type alias that may not be accepted everywhere.
# Columns that need datetime/numeric/boolean semantics must be converted
# explicitly before they are used that way.
hydrated = pd.read_csv(directory, dtype=str)

Tweet counts over time

In [None]:
# Weekly tweet volume across the whole dataset.
# `created_at` is loaded as text, and resample() only accepts datetime-like
# values, so parse it first. Sorting afterwards is chronological rather than
# lexicographic. Re-running this cell is safe: parsing datetimes is a no-op.
hydrated = hydrated.assign(
    created_at=pd.to_datetime(hydrated['created_at'])
).sort_values(by=['created_at'])

# Count the non-null cells of every column inside each weekly bin.
weekly_tweet = hydrated.resample('W', on='created_at').count()
# Keep only the calendar date so the bar-chart tick labels stay short.
weekly_tweet.index = weekly_tweet.index.date

In [None]:
# Bar chart of the weekly counts held in `weekly_tweet`.
ax = weekly_tweet.plot(kind='bar', y='letter_id_str', figsize=(8, 5))
# Weekly ('W') resampling labels each bin with the date that closes the week,
# so the tick dates are week-ending dates, not week-starting ones.
ax.set_xlabel('Date - Week Ending')
ax.set_ylabel('Tweets')
ax.tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()

Establish a dataframe based on users

In [None]:
# Build one summary record per user: number of tweets, account creation
# date, and whether any of the user's tweets carries a true verified flag.
df_by_user = hydrated.groupby('from_user')
user_dict = {}
for username, sub_df in df_by_user:
    # Positional access to the first row of the group; a label lookup would
    # misbehave if the frame's index ever contained duplicate labels.
    first_created = sub_df["user_created_at"].iloc[0]

    # The flag column is read as text, so "not null" would also be true for a
    # present-but-false value. Compare the text itself, ignoring case;
    # astype(str) keeps this working if the column has already been
    # converted to booleans.
    verified_flags = sub_df["user_verified"].astype(str).str.lower().eq("true")

    user_dict[username] = {
        "tweets_count": sub_df["text"].count(),  # non-null tweet texts
        "user_created_at": pd.to_datetime(first_created).date(),
        "user_verified": verified_flags.any(),
    }

In [None]:
# One row per user, indexed by username.
new_df = pd.DataFrame.from_dict(user_dict, orient='index')
# Persist the per-user stats before the column is reshaped below.
new_df.to_csv(os.path.join("data", "user_stats.csv"))

# A single multi-key sort: verified flag first, then account creation date,
# then tweet count (largest first). Chaining separate single-key sorts only
# guarantees the order of the last key, because the default sort algorithm
# is not stable.
new_df = new_df.sort_values(
    ['user_verified', 'user_created_at', 'tweets_count'],
    ascending=[True, True, False],
)

# Bucket the creation dates by calendar month and group the users on it.
new_df['user_created_at'] = pd.DatetimeIndex(new_df['user_created_at']).to_period('M')
month_group = new_df.groupby('user_created_at')

In [None]:
# For every account-creation month, print the percentage split of the
# user_verified values among the users created in that month.
for creation_month, monthly_users in month_group:
    verified_share = monthly_users['user_verified'].value_counts(normalize=True)
    print(creation_month, " --> ", verified_share * 100)

In [None]:
# Number of users per user_verified value.
new_df.value_counts(subset='user_verified')

To investigate verified individuals:<br>
Step 1. Collect every user_description and build a corpus from them.<br>
Step 2. Remove the stop words and count the term frequencies (TF).<br>
Step 3. Show word-count statistics and keep the top words (top 50?).<br>
Step 4. Arbitrarily define two sets of words to distinguish between Media and Individuals.<br>
Step 5. Run the classification over all verified users.

In [None]:
# Distinct user descriptions across the whole dataset, in order of first appearance.
corpus_user_description_list = pd.unique(hydrated['user_description']).tolist()

In [None]:
# Tweets from verified accounts. The flag is stored as text, so match it
# case-insensitively rather than against one fixed spelling; astype(str)
# keeps this working if the column has already been converted to booleans.
df_verified = hydrated[hydrated['user_verified'].astype(str).str.lower() == 'true']
# Distinct descriptions of the verified accounts, in order of first appearance.
user_des_list = df_verified['user_description'].unique().tolist()

# Test Field on a single user (realBenTalks)
ben = hydrated[hydrated['from_user'] == 'realBenTalks'].sort_values(by=['created_at'])

A plot on how many tweets Ben tweeted each week:

In [None]:
# Weekly tweet counts for the single test user held in `ben`.
# Guard the re-indexing so the cell can be re-run: once `created_at` is the
# index it is no longer a column, and set_index would raise.
if 'created_at' in ben.columns:
    ben = ben.set_index('created_at')
# resample() needs a datetime-like index; the timestamps may still be text.
ben.index = pd.to_datetime(ben.index)

# Count the non-null cells per column in each weekly bin, then expose the
# bin date as a regular column so it can be used as the plot's x axis.
weekly_tweet = ben.resample('W').count().reset_index()
weekly_tweet['created_at'] = weekly_tweet['created_at'].dt.date

In [None]:
# Bar chart of the weekly counts held in `weekly_tweet`, one bar per bin date.
ax = weekly_tweet.plot(kind='bar', x='created_at', y='letter_id_str', figsize=(8, 5))
# Weekly ('W') resampling labels each bin with the date that closes the week,
# so the tick dates are week-ending dates, not week-starting ones.
ax.set_xlabel('Date - Week Ending')
ax.set_ylabel('Tweets')
ax.tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()

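Calling `.astype('bool')` would turn every non-empty string into `True` (including the text that means false), so the line below is not used; instead we compare the text with `==` so that only values that actually say true become `True`.<br>
hydrated['user_verified'] = hydrated['user_verified'].astype('bool')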

In [None]:
# Turn the textual verified flag into real booleans.
# The comparison ignores case so it does not depend on how the flag was
# spelled in the file, and astype(str) makes the cell safe to re-run: an
# already-boolean column is rendered back to text and converted again
# instead of collapsing to all-False.
hydrated['user_verified'] = hydrated['user_verified'].astype(str).str.lower() == 'true'

This does not seem to be needed: if we want the number of verified users, we can count them from the dataframe directly.<br>
print(hydrated.isnull().values.sum())

In [None]:
# Number of rows per user_verified value.
hydrated.loc[:, 'user_verified'].value_counts()

Test if users are always verified:

In [None]:
# Classify every user by how consistent their verified flag is across tweets:
#   cnt_mix   - more than one distinct flag value appears for the user
#   cnt_true  - a single value, and the user's first row is True
#   cnt_false - everything else
# NOTE: the `== True` test only matches real booleans, so this assumes
# user_verified has already been converted from text.
verified_by_user = hydrated.groupby('from_user')['user_verified']

distinct_flags = verified_by_user.nunique()  # missing values are not counted
first_flag = verified_by_user.agg(lambda flags: flags.iloc[0])

is_mixed = distinct_flags > 1
is_verified = ~is_mixed & first_flag.eq(True)

cnt_mix = int(is_mixed.sum())
cnt_true = int(is_verified.sum())
cnt_false = int(len(distinct_flags)) - cnt_mix - cnt_true
# No anomaly condition is defined; kept at zero for the summary printed later.
error = 0

In [None]:
# Report the per-user verification tallies in one dictionary.
verification_summary = {
    'Verified:': cnt_true,
    "Not Verified": cnt_false,
    "Verified in between:": cnt_mix,
    "Anormaly:": error,
}
print(verification_summary)

# Alternative view: number of distinct id_str / letter_id_str values
# for each user_verified value.
hydrated.groupby('user_verified')[['id_str', 'letter_id_str']].nunique()

seaborn histogram 

not really sure what categories will be useful/possible here

In [None]:
# Histogram of user_listed_count.
# The column is loaded as text, so convert it to numbers first; values that
# cannot be parsed become NaN and are dropped.
listed_counts = pd.to_numeric(hydrated['user_listed_count'], errors='coerce').dropna()

fig, ax = plt.subplots()
# histplot is seaborn's replacement for the deprecated distplot;
# kde=False keeps this a plain count histogram with 9 bins.
sns.histplot(listed_counts, bins=9, kde=False, color='blue', edgecolor='black', ax=ax)
# Add labels
ax.set_title('User Listed?')
ax.set_xlabel('user_listed_count')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Scatter of the verified flag against the row position (1..N).
# N is taken from the dataframe itself rather than hard-coded, so the cell
# keeps working when the dataset has a different number of rows.
row_number = np.arange(1, len(hydrated) + 1)
sns.scatterplot(x=row_number, y=hydrated['user_verified'])
# (x=hydrated['user_verified'], y=hydrated['user_statuses_count'])

Heat map pearson correlation matrix

In [None]:
# Pearson correlation heat map.
# The dataframe is loaded as text, so corr() cannot be applied to it
# directly. Convert what can be converted and keep only the columns that
# hold data and whose every non-missing value parsed as a number.
# Datetime columns are left out.
candidates = hydrated.select_dtypes(exclude=['datetime', 'datetimetz'])
as_numbers = candidates.apply(pd.to_numeric, errors='coerce')
fully_numeric = as_numbers.notna().any() & (
    as_numbers.notna().sum() == candidates.notna().sum()
)
# NOTE: identifier-like columns made only of digits also qualify here;
# drop them from the selection if they are not meaningful features.
corrmat = as_numbers.loc[:, fully_numeric].corr()

f, ax = plt.subplots(figsize=(16, 12))
# Cap the colour scale at 0.8 so strong correlations stand out.
sns.heatmap(corrmat, vmax=.8, square=True, ax=ax)

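Lighter colours (see the colour bar on the right, whose scale is capped at 0.8) mark highly correlated features,<br>
while darker colours, from around 0 down to about -0.2, mark features with little or slightly negative correlation.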

Your heatmap is correct, you just forgot to change the dataframe name from pottermerged --> hydrated

In [None]:
# Annotated Pearson correlation heat map.
# The dataframe is loaded as text, so corr() cannot be applied to it
# directly. Convert what can be converted and keep only the columns that
# hold data and whose every non-missing value parsed as a number.
# Datetime columns are left out.
candidates = hydrated.select_dtypes(exclude=['datetime', 'datetimetz'])
as_numbers = candidates.apply(pd.to_numeric, errors='coerce')
fully_numeric = as_numbers.notna().any() & (
    as_numbers.notna().sum() == candidates.notna().sum()
)
numeric_features = as_numbers.loc[:, fully_numeric]

plt.figure(figsize=(30, 30))
plt.title('Pearson Correlation of Features', size=15)
# Diverging palette so positive and negative correlations get distinct hues.
colormap = sns.diverging_palette(10, 220, as_cmap=True)
sns.heatmap(numeric_features.corr(),
            cmap=colormap,
            square=True,
            annot=True,  # print the coefficient inside every cell
            linewidths=0.1, vmax=1.0, linecolor='white',
            annot_kws={'fontsize': 12})
plt.show()

In the correlation matrix above we also printed the coefficients, which makes it easier<br>
to see which features are highly correlated: values close to 1.00 indicate a strong positive correlation.