In [None]:
import pandas as pd
import json
from collections import Counter
import numpy as np
from datetime import datetime
import re
import operator
import csv
import matplotlib.pyplot as plt
import plotly.plotly as py
import plotly.tools as tls

# Task 0: Data preparation (skip it if you want to see a completed task)

# Load and transform data from json

Initial load of json file into dataframe

In [None]:
# The file stores one JSON document per line, hence lines=True.
inst_df = pd.read_json(
    path_or_buf='../datasets/instagram_followers.json',
    lines=True,
)

Let's see what the data looks like and how many rows we have

In [None]:
# Preview the first five rows (five is the default for head()).
inst_df.head()

In [None]:
# Total number of rows; later cells print it next to per-column counts.
rows_count = inst_df.shape[0]
# Number of non-null values in every column.
inst_df.notnull().sum()

In [None]:
# Data type that pandas inferred for each column.
inst_df.dtypes

### Let's explore this dataset column by column.
### And also make all necessary transformations to our dataframe by the way

## Audience

Let's check how many NaNs we have in the "audience" column

In [None]:
# Compare the number of missing "audience" values with the total row count.
print("Audience NaNs = {}".format(inst_df['audience'].isnull().sum()))
print("Rows count = {}".format(rows_count))

So only 24 people in our dataset have the audience field filled in

Let's see what a non-NaN audience value looks like

In [None]:
# Keep only the rows whose "audience" value is present.
audience_series = inst_df.loc[inst_df['audience'].notnull(), 'audience']
audience_series

## is_deleted

Maybe we can use the "is_deleted" column to indicate whether an account is fake?

In [None]:
# Print the total row count, then tally the distinct "is_deleted" values.
print("Rows count = {}".format(rows_count))
Counter(inst_df['is_deleted'])

Okay, it seems that we have only active users. <br><br>
We can get rid of this column

In [None]:
# errors='ignore' makes this a no-op when the column is already gone,
# so the cell can be run more than once.
inst_df = inst_df.drop('is_deleted', axis=1, errors='ignore')
inst_df.head(1)

## follows

Let's see how many people in our dataset have a non-empty "follows" field

In [None]:
# `.str.len()` returns NaN for missing values, and `NaN != 0` is True, so a
# `!= 0` test would also select rows that have no "follows" data at all.
# `> 0` keeps only the rows that hold a non-empty list.
people_with_followers = inst_df[inst_df['follows'].str.len() > 0]
len(people_with_followers)

In [None]:
# Use positional access: the index label of the first user with followers
# depends on the dataset and must not be hard-coded.
people_with_followers['follows'].iloc[0]

So followers are represented as a list of dictionaries, where each follower has:
1. a datetime (possibly the moment the following started)
2. a uid 

We can convert the "$date" field from a string to a datetime to make later analysis easier

In [None]:
# Timestamp layout of the "$date" strings. `%f` accepts one to six fractional
# digits, so the ".000" values are still parsed while non-zero milliseconds
# no longer raise ValueError.
dt_format = '%Y-%m-%dT%H:%M:%S.%fZ'

# Replace every follower's {'$date': <str>} wrapper with a datetime, in place.
for row in people_with_followers['follows']:
    # Missing values (None / NaN) are not iterable; skip them.
    if not isinstance(row, list):
        continue
    for follower in row:
        stamp = follower['t']
        # Only raw records are converted. After a first run 't' is already a
        # datetime, so checking its type first keeps the cell re-runnable.
        if isinstance(stamp, dict) and isinstance(stamp['$date'], str):
            follower['t'] = datetime.strptime(stamp['$date'], dt_format)

In [None]:
# Show the "follows" list of the first user that has followers. The row is
# looked up through people_with_followers instead of a hard-coded index label.
inst_df.loc[people_with_followers.index[0], 'follows']

## hashtags

Let's explore what the hashtags look like

In [None]:
# Hashtags of the row with index label 0.
inst_df.loc[0, 'hashtags']

The 'p' field doesn't seem to be very useful, but "'t': {'$date'}" and 'txt' are <br><br>
Let's transform the "hashtags" field into a list of dictionaries with 'txt' as the key and the datetime as the value

In [None]:
# `> 0` rather than `!= 0`: `.str.len()` is NaN for missing values and
# `NaN != 0` is True, which would let rows without hashtags through.
people_with_hashtags = inst_df[inst_df['hashtags'].str.len() > 0]

In [None]:
def transform_hashtags(hashtags_list):
    """Convert raw hashtag records into ``{text: datetime}`` dictionaries.

    Parameters
    ----------
    hashtags_list : list of dict, None or NaN
        Raw records shaped like ``{'txt': <str>, 't': {'$date': <str>}, ...}``.
        Records that do not carry a ``'t'`` dictionary are treated as already
        transformed and are passed through unchanged, so applying the function
        twice is safe.

    Returns
    -------
    list of dict or None
        One single-key dictionary per record, mapping the hashtag text to its
        datetime. ``None`` is returned for a missing value (``None`` or NaN).

    Notes
    -----
    As a side effect, the ``'$date'`` string inside each raw record is
    replaced in place by the parsed datetime. Parsing uses the module-level
    ``dt_format``.
    """
    # pandas represents missing cells as None or float NaN; neither is iterable.
    if hashtags_list is None or isinstance(hashtags_list, float):
        return None

    new_list = []
    for single_dict in hashtags_list:
        stamp = single_dict.get('t')
        if not isinstance(stamp, dict):
            # Already in the {text: datetime} form (e.g. the cell was re-run).
            new_list.append(single_dict)
            continue
        if isinstance(stamp['$date'], str):
            stamp['$date'] = datetime.strptime(stamp['$date'], dt_format)
        new_list.append({single_dict['txt']: stamp['$date']})

    return new_list

In [None]:
# Assignment aligns on the index: rows that are not part of
# people_with_hashtags end up as NaN in the "hashtags" column.
inst_df['hashtags'] = people_with_hashtags['hashtags'].map(transform_hashtags)

In [None]:
# Transformed hashtags of the row with index label 0.
inst_df.loc[0, 'hashtags']

## ig_username

Type of "ig_username" is already a string, so we don't need to convert it manually

In [None]:
# Python type of the "ig_username" value in the row with index label 0.
type(inst_df.loc[0, 'ig_username'])

## likes

Let's see what "likes" look like

In [None]:
# Likes of the row with index label 0.
inst_df.loc[0, 'likes']

We have 3 fields:
1. uid of the liked person
2. pid - process id (don't know why it is given) 
3. datetime - datetime of the like

In [None]:
# Collect the uid of every like made by the first user and count how often
# each uid occurs.
likes_uid_list = [like['uid'] for like in inst_df.loc[0, 'likes']]

counter = Counter(likes_uid_list)
counter

In [None]:
# `> 0` rather than `!= 0`: `.str.len()` is NaN for missing values and
# `NaN != 0` is True, which would let rows without likes through.
people_with_likes = inst_df[inst_df['likes'].str.len() > 0]

In [None]:
def transform_likes(likes_list):
    """Replace the ``{'$date': <str>}`` wrapper of every like with a datetime.

    Parameters
    ----------
    likes_list : list of dict, None or NaN
        Like records; each one must have a ``'t'`` key.

    Returns
    -------
    list of dict or None
        A new list holding the same record objects, whose ``'t'`` values have
        been converted in place. ``None`` is returned for a missing value
        (``None`` or NaN).

    Notes
    -----
    Records whose ``'t'`` is no longer a ``{'$date': ...}`` dictionary are
    left untouched, so the function can be applied to already-converted data.
    Parsing uses the module-level ``dt_format``.
    """
    # pandas represents missing cells as None or float NaN; neither is iterable.
    if likes_list is None or isinstance(likes_list, float):
        return None

    new_list = []
    for single_dict in likes_list:
        stamp = single_dict['t']
        # Check the container type first: once converted, 't' is a datetime
        # and subscripting it with '$date' would raise TypeError.
        if isinstance(stamp, dict) and isinstance(stamp['$date'], str):
            single_dict['t'] = datetime.strptime(stamp['$date'], dt_format)
        new_list.append(single_dict)
    return new_list

In [None]:
# Assignment aligns on the index: rows that are not part of
# people_with_likes end up as NaN in the "likes" column.
inst_df['likes'] = people_with_likes['likes'].map(transform_likes)

In [None]:
# Converted likes of the row with index label 0.
inst_df.loc[0, 'likes']

Here we transformed 't' field to datetime for future analysis

## mentions

In [None]:
# Mentions of the row with index label 0.
inst_df.loc[0, 'mentions']

In [None]:
# `> 0` rather than `!= 0`: `.str.len()` is NaN for missing values and
# `NaN != 0` is True, which would let rows without mentions through.
people_with_mentions = inst_df[inst_df['mentions'].str.len() > 0]

In [None]:
def transform_mentions(mentions_list):
    """Replace the ``{'$date': <str>}`` wrapper of every mention with a datetime.

    Parameters
    ----------
    mentions_list : list of dict, None or NaN
        Mention records; each one must have a ``'t'`` key.

    Returns
    -------
    list of dict or None
        A new list holding the same record objects, whose ``'t'`` values have
        been converted in place. ``None`` is returned for a missing value
        (``None`` or NaN).

    Notes
    -----
    Records whose ``'t'`` is no longer a ``{'$date': ...}`` dictionary are
    left untouched, so the function can be applied to already-converted data.
    Parsing uses the module-level ``dt_format``.
    """
    # pandas represents missing cells as None or float NaN; neither is iterable.
    if mentions_list is None or isinstance(mentions_list, float):
        return None

    new_list = []
    for single_dict in mentions_list:
        stamp = single_dict['t']
        # Check the container type first: once converted, 't' is a datetime
        # and subscripting it with '$date' would raise TypeError.
        if isinstance(stamp, dict) and isinstance(stamp['$date'], str):
            single_dict['t'] = datetime.strptime(stamp['$date'], dt_format)
        new_list.append(single_dict)
    return new_list

In [None]:
# Assignment aligns on the index: rows that are not part of
# people_with_mentions end up as NaN in the "mentions" column.
inst_df['mentions'] = people_with_mentions['mentions'].map(transform_mentions)

In [None]:
# Converted mentions of the row with index label 0.
inst_df.loc[0, 'mentions']

Here we also transformed the 't' field to datetime for future analysis

## update_time and time_add

In [None]:
def dict_to_date(x):
    """Turn a ``{'$date': <str>}`` wrapper into a datetime.

    Parameters
    ----------
    x : dict or datetime
        Either the raw wrapper or an already-converted value.

    Returns
    -------
    datetime
        The parsed timestamp; an ``x`` that is already a datetime is returned
        unchanged so that re-applying the conversion does not raise.

    Notes
    -----
    Parsing uses the module-level ``dt_format``.
    """
    if isinstance(x, datetime):
        return x
    return datetime.strptime(x['$date'], dt_format)

# Both timestamp columns share the same {'$date': ...} wrapper format.
for column in ('update_time', 'time_add'):
    inst_df[column] = inst_df[column].apply(dict_to_date)

In [None]:
# First row of the frame after all the conversions above.
inst_df.iloc[:1]

# Task 1: the most popular name

## user

In [None]:
# Full names of all users, skipping empty / missing ones.
full_name_list = [
    profile['full_name']
    for profile in inst_df['user']
    if profile['full_name']
]
full_name_list

In [None]:
new_names = []
for raw_name in full_name_list:
    # Drop every non-ASCII character, then the surrounding whitespace.
    cleaned = raw_name.encode('ascii', errors='ignore').decode('ascii').strip()
    if not cleaned:
        continue

    # A name that starts with "X Y " is taken to be spelled out letter by
    # letter, so all of its spaces are removed.
    spelled_out = (
        len(cleaned) >= 4
        and cleaned[0].isalnum()
        and cleaned[1] == ' '
        and cleaned[2].isalnum()
        and cleaned[3] == ' '
    )
    if spelled_out:
        cleaned = cleaned.replace(' ', '')

    new_names.append(cleaned)

new_names

In [None]:
# Case-insensitive number of occurrences of every word found in the names.
# Converted back to a plain dict, keyed in first-seen order.
count = dict(Counter(
    word.lower()
    for name in new_names
    for word in name.split()
))

count

In [None]:
# Ascending sort by occurrences, then reversed, giving most frequent first.
sorted_count = sorted(count.items(), key=lambda pair: pair[1])[::-1]
sorted_count

### Looking at the top of the counted names, we can see that our estimates are not entirely correct - we haven't excluded pronouns, articles and single letters
### To remove all of them we need an external list of names

The first I found was:
http://deron.meranda.us/data/census-derived-all-first.txt

Let's load it and then filter our counted names

In [None]:
with open("census-derived-all-first.txt") as f:
    reader = csv.reader(f, delimiter=' ')
    d = list(reader)

# Keep only the first column (the name), lower-cased. csv.reader returns an
# empty row for a blank line, which would raise IndexError on row[0].
list_of_names = [row[0].lower() for row in d if row]

list_of_names

In [None]:
# A set makes each membership test O(1); testing against the list made this
# cell quadratic in practice.
known_names = set(list_of_names)
names_to_remove = [
    name_count
    for name_count in sorted_count
    if name_count[0] not in known_names
]

In [None]:
# Filter in a single pass instead of calling list.remove() per item: that was
# quadratic and raised ValueError when the cell was run a second time.
# Slice assignment keeps the update in place on the existing list object.
rejected = set(names_to_remove)
sorted_count[:] = [
    name_count
    for name_count in sorted_count
    if name_count not in rejected
]

sorted_count

Let's make sure that "my" and "in" are actually present in the list of names

In [None]:
# Print the position of each word in the name list; list.index raises
# ValueError if the word is absent.
for pseudo_name in ('my', 'in'):
    print(list_of_names.index(pseudo_name))

Now we can draw a plot that shows, for example, the top 10 names

In [None]:
top_ten = sorted_count[:10]
top_ten_names = [i[0] for i in top_ten]
top_ten_values = [i[1] for i in top_ten]

# Derive the x positions from the data: fewer than ten names may remain
# after filtering, and a hard-coded range(10) would then fail.
positions = list(range(len(top_ten)))

fig, ax = plt.subplots()
ax.scatter(positions, top_ten_values)
ax.set_xticks(positions)
ax.set_xticklabels(top_ten_names, rotation='vertical')
# Write the exact count next to every point.
for position, value in zip(positions, top_ten_values):
    ax.annotate(str(value), xy=(position, value))
ax.set_title("Top {} names".format(len(top_ten)))
ax.set_ylabel("Occurrences")
plt.show()

### <u>Result: "nicole" is the most popular name (after "queen" and "love", which may not be used as actual names)</u>

# Task 2 Country analysis

<b>Определить город и страну каждого пользователя по имеющимся данным</b>

It's a complex task, because in this dataset we almost have no geodata - data, which is connected to certain geolocation.

<i>I said almost because we actually have an "audience" attribute, but it is not NaN for just 24 people in this dataset. Of course, it's not enough to put beside every person in this dataset their country.</i>

So we need multiple criteria with different weights to categorize users by their country.

<b>Here are my suggestions for criteria that can be used:</b>

### Here are my suggestions for criteria that can be used:

### 1. Audience field

Let's look at an example of a non-NaN audience field

In [None]:
# Show one audience record by index label. audience_series only holds non-NaN
# values. NOTE(review): label 111 is dataset-specific - confirm it is present
# in audience_series for your copy of the data.
audience_series[111]

So here we have very interesting stats about the audience:
 1. Percent of audience by gender
 2. Percent of audience by country

And we can put it to good use

#### Suggestion 1: If a person has a significant share of their audience (let's say more than 50%) in one country, then it's very likely that he/she lives in that country

### 2. "about" field in "user"

Let's look at "user" field of the first user

In [None]:
# "user" record of the row with index label 0.
inst_df.loc[0, 'user']

In 'about' we clearly see "Toronto", which is a Canadian city.

#### Suggestion 2: Parse the 'about' field and look for country and city names in it

### 3. "connected_fb_page" field in "user"

If this field isn't empty, we can open the person's Facebook page and look for the country name there, if it is specified
#### Suggestion 3: Parse the person's connected Facebook page and look for the country name there

### 4. "connected_fb_page" and "about" field of the "mentions" field and "likes" field

We can take another, more clever strategy of finding a person's country. <br>
I'm not certain, what "mention" means (since I'm not using Instagram), so I will assume that it is a mention of some other person

#### Suggestion 4: Parse the connected Facebook pages of the people the person mentioned and liked (since we have their uids) and look for country names there. 
#### For example, if more than 50% of them are from one particular country, then we can assume that the person who mentioned and liked them is also from this country


### 5. Timezone based on datetime, when "likes" and "posts" are made

This is the last suggestion and the one least likely to work correctly; further investigation is needed to find out whether it works at all.<br><br>
On its own this method is of little use, since at best it gives us only the person's timezone, not the country within that timezone. Nevertheless, when combined with other methods, it can confirm or reject the country suggestions produced by them.

#### Suggestion 5: Since we have the datetime of likes, mentions and posts in UTC, we can try to determine the person's timezone (based on when he/she likes, mentions and posts) to confirm or reject the countries suggested by other methods.

# Task 3 and 4 Classification of users to real and fake

3. Часть предоставленных пользователей – фейковые (созданные с целью накрутки активности).
Найдите признаки (или создайте новые на основе имеющихся), по которым можно отличить
фейковых пользователей от настоящих и обоснуйте причины их выбора.
4. С помощью методов машинного обучения (из пакета scikit-learn) реализуйте модель,
способную классифицировать аккаунты на реальные и фейковые.

### 1. Fake and real users' features suggestions

#### Suggestion 1: Number of posts since account creation (I suppose that is "time_add"). If there are lots of posts right from the start, the account is likely fake (this needs to be combined with other metrics, since a real user can also post a lot)
#### Suggestion 2: Number of posts since account creation together with the average time between posts (fake accounts are those that were created recently and have a small average time between posts)
#### Suggestion 3: Number of posts since account creation together with the average time between likes (fake accounts are those that were created recently and have a small average time between likes)

### All of these suggestions are similar, so further investigation is needed to find other metrics for classification. 

### 2. Apply machine learning for classification of fake and real users

Building a model from unlabeled data (our dataset has no label telling whether an account is fake or not) is called unsupervised learning (рус. "обучение без учителя")

So how to apply machine learning to this problem?

<b>Cluster analysis</b>

Given "fake and real users' features suggestions" we can make a cluster analysis of data based on those features. <br>
<br>
We should have 2 clusters - real and fake users.
<br>
<br>
Taking some cluster algorithm, such as:
1. Prototype-based clustering (for example, k-means)
2. Agglomerative hierarchical clustering (for example, with single/complete/average linkage)
3. Density-based clustering (for example, DBSCAN)
4. Graph-based clustering (for example, spectral clustering)

we can check whether the data, described by exactly these features, really splits into two different clusters.

After applying a clustering algorithm we can check its correctness by visualizing the clusters with scatter plots and by performing a so-called "silhouette analysis"

<b>Those features, which will divide data into 2 clusters (real and fake users), will prove that they can be used for classification of the data.</b>

<i>Unfortunately, I didn't have enough time to apply any of these methods and then perform a classification based on the clustering that gave the best results.</i>
<br>
<i>All of this knowledge comes from reading the book "Python Machine Learning", specifically the chapter "Working with unlabeled data - Clustering analysis".</i>