# Classification: Members of the Greek Parliament Dataset
---

Libraries:

In [1]:
from batoomer.twitter_nodes.search_engine import TwitterSearchEngine
import numpy as np
import pandas as pd
import tweepy
import json
from tqdm.notebook import tqdm
import time

TwitterAPI authentication:

In [2]:
# Read the Twitter API credentials from a JSON file located two directories
# above the notebook, so no secret is hard-coded here. The file must provide
# the four keys used below.
with open('../../twitter_credentials.json', 'r', encoding='utf-8') as f:
    twitter_credentials = json.load(f)

# TwitterAPI Authentication
auth = tweepy.OAuthHandler(twitter_credentials['consumer_key'], twitter_credentials['consumer_secret'])
auth.set_access_token(twitter_credentials['access_token_key'], twitter_credentials['access_token_secret'])
API = tweepy.API(auth)

## Data Collection
---
### Step 1

We use the dataset of the members of the parliament, which was provided, and collect the following information for each politician with a Twitter Account and label them with the integer 1:
- screen_name
- name
- description
- statuses_count
- favourites_count
- friends_count
- followers_count
- default_profile_image



For two members of the parliament, we got an error:
- Κατσαφάδος Κώστας (katsafados): Account does not exist.
- Στέργιος Γιαννάκης (stergiannakis): Account suspended.

In [3]:
# Load the parliament-member list and keep only the rows that carry a Twitter handle.
df = pd.read_csv('../../../parliament-members-new.csv', delimiter=';').replace(np.nan, '')
df = df[df['Twitter Handle'] != '']

# How many times a single account is retried after a rate-limit response
# before it is given up on and recorded in `failed`.
MAX_RATE_LIMIT_RETRIES = 3

failed = []   # handles that could not be resolved
results = []  # one feature row per resolved account

for account in tqdm(df['Twitter Handle']):
    for _attempt in range(MAX_RATE_LIMIT_RETRIES + 1):
        try:
            user = API.get_user(account)
        except tweepy.RateLimitError:
            # Wait out the rate-limit window and retry the SAME account.
            # Without the retry, the account would be dropped silently:
            # neither collected nor listed in `failed`.
            print('Rate Limit Hit. Wait 15 min.')
            time.sleep(60*15)
        except Exception as err:
            # Any other API error (unknown user, suspended account, ...):
            # report it and remember the handle.
            print(f'Errror for {account}: {err}')
            failed.append(account)
            break
        else:
            results.append(
                [user.screen_name, user.name, user.description, user.statuses_count,
                 user.favourites_count, user.friends_count, user.followers_count,
                 user.default_profile_image])
            break
    else:
        # Every attempt was rate limited; record the handle instead of losing it.
        failed.append(account)

  0%|          | 0/242 [00:00<?, ?it/s]

Errror for katsafados: [{'code': 50, 'message': 'User not found.'}]
Errror for stergiannakis: [{'code': 63, 'message': 'User has been suspended.'}]


In [5]:
# Turn the collected feature rows into a labelled frame; every row here is a
# member of the parliament, hence the constant label 1.
politicians = pd.DataFrame(results)
politicians.columns = [
    'screen_name',
    'name',
    'description',
    'statuses_count',
    'favourites_count',
    'friends_count',
    'followers_count',
    'default_profile_image',
]
politicians['parliament_member'] = 1
politicians.head(5)

Unnamed: 0,screen_name,name,description,statuses_count,favourites_count,friends_count,followers_count,default_profile_image,parliament_member
0,ViliardosV,Βασίλης Βιλιάρδος,"Οικονομολόγος, ΑΣΟΕΕ Αθηνών, με μεταπτυχιακά σ...",14936,3644,274,4602,False,1
1,anaik2020,ANASTASIA-EKATERINI ALEXOPOULOU,Βουλευτής Β1 Βόρειου Τομέα Αθηνών\nΕΛΛΗΝΙΚΗ ΛΥ...,3314,17496,494,394,False,1
2,MariaAt03902914,Maria Athanasiou- ΕΛΛΗΝΙΚΗ ΛΥΣΗ,Ελληνική Λύση,4825,11018,84,192,False,1
3,B3Vagenas,Δημητρης Βαγενας,Βουλευτής Νότιου Τομέα Αθηνών Β3 Ελληνικής Λύσ...,42,274,35,46,False,1
4,SofAsimak,ΣΟΦΙΑ ΑΣΗΜΑΚΟΠΟΥΛΟΥ - ΕΛΛΗΝΙΚΗ ΛΥΣΗ,,8975,422,860,360,False,1


In [22]:
# Number of politician accounts that were successfully collected.
politicians.shape[0]

240

### Step 2
We use each politician's surname as a query for the search_users endpoint, to collect non-politician accounts.<br>
First, for each query, we keep the first three accounts returned by the endpoint. Next, we remove the politicians' <br>
accounts from the results, and then we extract the same information we extracted for the politicians and label them with 0.<br>
Then we keep all accounts with a description and append accounts without descriptions until we have 240 accounts that are possibly<br>
not members of the parliament. To be sure, we export these accounts to a .csv file and check whether there are any politicians; <br>
if there are, we remove them.

In [13]:
# Query the user-search endpoint once per politician surname and collect the
# returned accounts.
se = TwitterSearchEngine(twitter_credentials)
queries = list(df['Surname'])
# NOTE(review): the prose above mentions three results per query and the next
# cell reads 'Result 1'..'Result 3', while `count` is 2 — confirm how
# TwitterSearchEngine interprets `count`.
count = 2

# Per-query results are gathered in a list and concatenated once at the end,
# instead of growing a DataFrame inside the loop with DataFrame.append
# (quadratic copying; removed in pandas 2.0).
result_frames = []

for query in tqdm(queries):
    if not query:
        # Skip rows without a surname.
        continue
    try:
        se.search(query=query, count=count)
        result = se.get_results()
    except tweepy.RateLimitError as err:
        # Wait out the rate-limit window, then retry the same query once.
        print('Rate Limit Hit! Sleeping for 15mins.')
        time.sleep(60*15)
        se.search(query=query, count=count)
        result = se.get_results()
    # assumes get_results() returns a DataFrame (it was previously passed to
    # DataFrame.append without ignore_index) — TODO confirm
    result_frames.append(result)

# pd.concat raises on an empty list, so fall back to an empty frame.
results_fullnames = pd.concat(result_frames) if result_frames else pd.DataFrame()

  0%|          | 0/242 [00:00<?, ?it/s]

In [15]:
# Pool every account returned by the search, in first-seen order.
candidates = []
for column in ('Result 1', 'Result 2', 'Result 3'):
    candidates.extend(results_fullnames[column])

# Twitter handles are case-insensitive, so compare lower-cased; an exact
# string comparison would let a politician whose handle differs only in
# letter case slip into the non-politician pool.
known_handles = {str(handle).lower() for handle in df['Twitter Handle']}

res = []
seen = set()
for account in candidates:
    # Queries with fewer hits leave missing values; pd.isna catches every NaN,
    # whereas `np.nan in list` only matches the very same NaN object.
    if pd.isna(account):
        continue
    key = str(account).lower()
    if key in known_handles or key in seen:
        continue
    seen.add(key)
    # Keeping first-seen order (instead of set() order) makes the later
    # positional top-up of description-less accounts reproducible.
    res.append(account)

len(res)

466

In [16]:
# How many times a single account is retried after a rate-limit response
# before it is given up on and recorded in `failed`.
MAX_RATE_LIMIT_RETRIES = 3

failed = []   # accounts that could not be resolved
results = []  # one feature row per resolved account

for account in tqdm(res):
    for _attempt in range(MAX_RATE_LIMIT_RETRIES + 1):
        try:
            user = API.get_user(account)
        except tweepy.RateLimitError:
            # Wait out the rate-limit window and retry the SAME account.
            # Without the retry, the account would be dropped silently:
            # neither collected nor listed in `failed`.
            print('Rate Limit Hit. Wait 15 min.')
            time.sleep(60*15)
        except Exception as err:
            # Any other API error: report it and remember the account.
            print(f'Unknown error: {err}')
            failed.append(account)
            break
        else:
            results.append(
                [user.screen_name, user.name, user.description, user.statuses_count,
                 user.favourites_count, user.friends_count, user.followers_count,
                 user.default_profile_image])
            break
    else:
        # Every attempt was rate limited; record the account instead of losing it.
        failed.append(account)

  0%|          | 0/466 [00:00<?, ?it/s]

In [34]:
# Turn the collected feature rows into a labelled frame; these accounts are
# the candidate non-politicians, hence the constant label 0.
not_pol = pd.DataFrame(results)
not_pol.columns = [
    'screen_name',
    'name',
    'description',
    'statuses_count',
    'favourites_count',
    'friends_count',
    'followers_count',
    'default_profile_image',
]
not_pol['parliament_member'] = 0

In [35]:
# Candidate accounts whose profile description is not empty.
has_description = not_pol['description'] != ''
not_pol_desc = not_pol[has_description]
len(not_pol_desc)

206

In [36]:
# Candidate accounts with an empty profile description; used to top up the sample.
not_pol_no = not_pol.loc[not_pol['description'] == '']

In [37]:
# Target size of the candidate set: 240, the number of politician accounts
# collected in Step 1.
N_TARGET_ACCOUNTS = 240

# Derive the number of description-less accounts to add from the data instead
# of hard-coding 240-206. Clamping at 0 keeps the slice from going negative,
# and re-running the cell adds nothing once the target is reached.
n_missing = max(N_TARGET_ACCOUNTS - len(not_pol_desc), 0)

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
not_pol_desc = pd.concat([not_pol_desc, not_pol_no.iloc[:n_missing]])
len(not_pol_desc)

240

In [41]:
# Export the candidate non-politician accounts so they can be checked by hand
# for politicians, as described in the Step 2 text.
not_pol_desc.to_csv('not_parliament_members.csv', index = False)

In [66]:
# Load the manually processed candidate file (semicolon-delimited) and turn
# missing values into empty strings.
not_pol_desc = (
    pd.read_csv('not_parliament_members_processed.csv', delimiter=';')
    .replace(np.nan, '')
)
len(not_pol_desc)

203

### Step 3
We merge our results to create a training and a validation set.<br>
First we shuffle each of the datasets we are going to merge.


In [67]:
# Shuffle the non-politician accounts. The fixed random_state makes the
# train/validation split reproducible across kernel restarts.
not_pol_desc = not_pol_desc.sample(frac=1, random_state=42).reset_index(drop=True)
len(not_pol_desc)

203

In [68]:
# Shuffle the politician accounts. The fixed random_state makes the
# train/validation split reproducible across kernel restarts.
politicians = politicians.sample(frac=1, random_state=42).reset_index(drop=True)

In [69]:
# Validation set: the first 50 shuffled politicians plus the first 50 shuffled
# non-politicians. pd.concat returns a new frame, so no explicit copy is needed,
# and it replaces DataFrame.append, which was removed in pandas 2.0.
validation_set = pd.concat([politicians.iloc[:50], not_pol_desc.iloc[:50]])

In [72]:
# Training set: everything after the first 50 rows of each shuffled frame, so
# it does not overlap with the validation set. pd.concat returns a new frame
# and replaces DataFrame.append, which was removed in pandas 2.0.
training_set = pd.concat([politicians.iloc[50:], not_pol_desc.iloc[50:]])

In [74]:
# Number of rows in the (still unbalanced) training set.
training_set.shape[0]

343

In [75]:
# Number of training rows per class label (0 = non-politician, 1 = politician).
training_set.pivot_table(index='parliament_member', aggfunc='size')

parliament_member
0    153
1    190
dtype: int64

In [76]:
# Balance the classes by drawing 150 rows from each label. GroupBy.sample keeps
# the label column and avoids groupby.apply on the grouping column, which is
# deprecated in recent pandas. The fixed random_state makes the draw reproducible.
training_set = (
    training_set
    .groupby('parliament_member')
    .sample(n=150, random_state=42)
    .reset_index(drop=True)
)
training_set.pivot_table(index=['parliament_member'], aggfunc='size')

parliament_member
0    150
1    150
dtype: int64

In [79]:
# Persist both splits as CSV files next to the notebook; the row index is not written.
validation_set.to_csv('parliament_members_validation_set.csv', index=False)
training_set.to_csv('parliament_members_training_set.csv', index=False)