In [55]:
import pandas as pd
from sklearn.cluster import DBSCAN
from math import radians, sin, cos, sqrt, atan2
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

In [26]:
#importing data 
frame = pd.read_csv('../data/02_intermediate/cleaned_data.csv')
frame

Unnamed: 0,country,city,lat,lon,user_id
0,Polska,Kraków,50.046910,19.997064,1022388809810563072
1,Polska,Polska,52.097718,19.025816,1022388809810563072
2,Czad,Czad,15.613414,19.015617,1022388809810563072
3,Czad,Czad,13.021107,14.580308,1022388809810563072
4,Polska,Poznań,52.408266,16.933520,1022388809810563072
...,...,...,...,...,...
9887,Polska,Warszawa,52.231924,21.006726,111313124
9888,USA,Warsaw,37.958746,-76.758021,111313124
9889,Polska,Warszawa,52.233717,21.071411,111313124
9890,Polska,Warszawa,52.233717,21.071411,111313124


In [None]:
answer_frame = pd.read_csv('../data/01_raw/users.csv')

In [27]:
# remove not from poland
frame = frame[frame['country'] == "Polska"]

In [198]:
# remove city Polska, Niemcy
frame = frame[(frame['city'] != "Polska") & (frame['city'] != "Niemcy") ]
len(frame)

2759

In [192]:
# remove those with small number of rows smaller than 5
counts = frame.groupby(['user_id'])['user_id'].count()
counts = counts[counts >= 5]
proper_users = counts.keys().to_list()


In [193]:
# spliting frame for every user
users_frames_list = []
for user_id in proper_users:
    users_frames_list.append(frame[frame['user_id'] == user_id].copy())

In [194]:
# Define a function to compute the Haversine distance between two points
def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # radius of Earth in kilometers
    dLat = radians(lat2 - lat1)
    dLon = radians(lon2 - lon1)
    lat1 = radians(lat1)
    lat2 = radians(lat2)

    a = sin(dLat/2)**2 + cos(lat1)*cos(lat2)*sin(dLon/2)**2
    c = 2*atan2(sqrt(a), sqrt(1 - a))
    return R * c

# Define a distance metric based on the Haversine function
def haversine_distance(x1, x2):
    lat1, lon1 = x1[0], x1[1]
    lat2, lon2 = x2[0], x2[1]
    return haversine(lat1, lon1, lat2, lon2)

In [199]:
user_id = 2
input_frame = users_frames_list[user_id].copy()
input_frame.drop(labels=['country','city','user_id'], axis=1, inplace=True)
scaller = MinMaxScaler(feature_range= (-1,1))
scaller.fit(input_frame)
input_frame = scaller.transform(input_frame)
dbscan = DBSCAN(eps=0.05, min_samples=2, metric=haversine_distance)
dbscan.fit(input_frame)
dbscan.labels_


array([-1,  0, -1, -1, -1, -1,  1,  2,  3,  2,  3,  4, -1, -1, -1,  0, -1,
        5,  5,  0,  5,  5,  5,  0,  5,  5,  0,  6,  7,  6,  7,  1,  8,  1,
        8, -1, -1,  7,  0,  0,  0,  0,  5,  2,  0,  0,  0,  0, -1,  4, -1,
       -1, -1, -1,  9, 10,  9, 10,  5,  0,  0])

In [200]:
set(dbscan.labels_)
reslut_frame = users_frames_list[user_id].copy()
reslut_frame['cluster'] = dbscan.labels_
reslut_frame = reslut_frame[reslut_frame['cluster'] != -1]
biggest_cluster = pd.Series.idxmax(reslut_frame.groupby(['cluster'])['cluster'].count())


In [201]:
reslut_frame[reslut_frame['cluster'] == biggest_cluster]



Unnamed: 0,country,city,lat,lon,user_id,cluster
9798,Polska,Warszawa,52.233717,21.071411,111313124,0
9833,Polska,Warszawa,52.233717,21.071411,111313124,0
9839,Polska,Warszawa,52.233717,21.071411,111313124,0
9845,Polska,Warszawa,52.233717,21.071411,111313124,0
9849,Polska,Warszawa,52.233717,21.071411,111313124,0
9861,Polska,Warszawa,52.233717,21.071411,111313124,0
9864,Polska,Warszawa,52.233717,21.071411,111313124,0
9865,Polska,Warszawa,52.233717,21.071411,111313124,0
9866,Polska,Warszawa,52.233717,21.071411,111313124,0
9870,Polska,Warszawa,52.233717,21.071411,111313124,0


In [206]:
reslut_frame.sort_values(["cluster"])

Unnamed: 0,country,city,lat,lon,user_id,cluster
9798,Polska,Warszawa,52.233717,21.071411,111313124,0
9876,Polska,Warszawa,52.233717,21.071411,111313124,0
9873,Polska,Warszawa,52.233717,21.071411,111313124,0
9872,Polska,Warszawa,52.233717,21.071411,111313124,0
9870,Polska,Warszawa,52.233717,21.071411,111313124,0
9866,Polska,Warszawa,52.233717,21.071411,111313124,0
9865,Polska,Warszawa,52.233717,21.071411,111313124,0
9864,Polska,Warszawa,52.233717,21.071411,111313124,0
9861,Polska,Warszawa,52.233717,21.071411,111313124,0
9889,Polska,Warszawa,52.233717,21.071411,111313124,0


In [203]:
answer_frame[answer_frame['id'] == reslut_frame.iloc[0]['user_id']]['location']

685    Earth
Name: location, dtype: object

In [204]:
answer_frame[answer_frame['id'] == reslut_frame.iloc[0]['user_id']]

Unnamed: 0.1,Unnamed: 0,username,name,protected,public_metrics,profile_image_url,id,description,location,created_at
685,688,podkowa_68,podkow@,False,,https://pbs.twimg.com/profile_images/755337311...,111313124,I ♥  & ***** *** !,Earth,2010-02-04T13:58:30.000Z
