# Recsys Challenge

### CONTENIDO
1. Leer dataset
2. Tweet analysis
3. Data cleaning
4. Análisis estadístico
5. Recommender
6. ...
7. Profit

## 1. Leer dataset

In [18]:
# Librerías usadas
import pandas as pd 
import random 
import re
import sys
from transformers import BertTokenizer

Al tener un dataset demasiado grande, hemos creado varios archivos con muestras random del dataset original mediante el siguiente código:


In [None]:
# Muestra random de un archivo .tsv muy grande 
filename = 'training.tsv'
n = 300000000 # tamaño "total"
s = 1000000 # tamaño muestra
skip = sorted(random.sample(range(1,n+1),n-s)) # filas a saltar

# print(skip)
# DESCOMENTAR SOLO PARA OBTENER NUEVAS MUESTRAS
# df = pd.read_csv(filename, skiprows=skip, delimiter='\x01', header=None)

# Definir columnas
all_features = ["text_tokens", "hashtags", "tweet_id", "present_media", "present_links", "present_domains",\
                "tweet_type","language", "tweet_timestamp", "engaged_with_user_id", "engaged_with_user_follower_count",\
               "engaged_with_user_following_count", "engaged_with_user_is_verified", "engaged_with_user_account_creation",\
               "engaging_user_id", "engaging_user_follower_count", "engaging_user_following_count", "engaging_user_is_verified",\
               "engaging_user_account_creation", "engagee_follows_engager","reply_timestamp","retweet_timestamp","retweet_with_comment_timestamp","like_timestamp"]
df.columns=all_features

# Exportarlo a un csv
export_filename = 'tocho.csv'

# DESCOMENTAR SOLO PARA OBTENER NUEVAS MUESTRAS
# df.to_csv(export_filename, encoding='utf-8')

In [19]:
# Leer e importar a un dataframe
training = pd.read_csv('tocho.csv', index_col = 0)
# Miramos las primeras filas
training.head(5)

Unnamed: 0,text_ tokens,hashtags,tweet_id,present_media,present_links,present_domains,tweet_type,language,tweet_timestamp,engaged_with_user_id,...,engaging_user_id,engaging_user_follower_count,engaging_user_following_count,engaging_user_is_verified,engaging_user_account_creation,engagee_follows_engager,reply_timestamp,retweet_timestamp,retweet_with_comment_timestamp,like_timestamp
0,101\t16493\t12478\t117\t10105\t42370\t76299\t1...,,39024FBE0136E046D1357196BAECFCA6,GIF,,,TopLevel,D3164C7FBCF2565DDF915B1B3AEFB1DC,1581532200,3BF0702AA3337E076DA5DF54E55117D1,...,000006829BEADA9EEA695CF0C334B426,2,29,False,1568107028,False,,,,
1,101\t11420\t10173\t45283\t10133\t40960\t10835\...,,6F4B64CF616A1B65C94B9581AB3617E5,,,,TopLevel,ECED8A16BE2A5E8871FD55F4842F16B1,1581134453,4C86333F8DC95B0D3C716C46A7862CCC,...,00046D54CBEE1354F395930AB602C6EF,87,100,False,1500643248,False,,,,1581158000.0
2,101\t10105\t76456\t22201\t119\t119\t119\t119\t...,,6DD591ABD65945B412C062DE26C55CAC,,,,Quote,D3164C7FBCF2565DDF915B1B3AEFB1DC,1581340818,304AD9FCC24B58A12FC7843F8DFE804E,...,000AF284D93A0EBDD1BE0D4701F581FC,583,1071,False,1465415499,True,,,,1581345000.0
3,101\t49307\t11090\t49339\t11369\t14703\t110014...,B929FC51754F08E9CF07498FF4DFC1DC,25BFBE285D27B2144C6B169C11083FAB,Photo,,,TopLevel,D3164C7FBCF2565DDF915B1B3AEFB1DC,1581332785,AF003AF390475BFCB3D3618FE55F950E,...,001C14131FC5556F3CB6555FF04F0AB1,272,1874,False,1425137677,False,,,,1581344000.0
4,101\t52212\t18010\t10120\t15122\t10325\t40323\...,,27A0AC9D2924771C97F7082EE951C1B1,Photo,,,TopLevel,717293301FE296B0B61950D041485825,1581108476,1699A4DEBFEAB17CC8F748C13F889C35,...,001DD4110BC58DBDC235D0B7A5B25ACE,1272,1243,False,1305546726,False,,,,1581109000.0


In [7]:
# Observar columnas
training.columns

Index(['text_ tokens', 'hashtags', 'tweet_id', 'present_media',
       'present_links', 'present_domains', 'tweet_type', 'language',
       'tweet_timestamp', 'engaged_with_user_id',
       'engaged_with_user_follower_count', 'engaged_with_user_following_count',
       'engaged_with_user_is_verified', 'engaged_with_user_account_creation',
       'engaging_user_id', 'engaging_user_follower_count',
       'engaging_user_following_count', 'engaging_user_is_verified',
       'engaging_user_account_creation', 'engagee_follows_engager',
       'reply_timestamp', 'retweet_timestamp',
       'retweet_with_comment_timestamp', 'like_timestamp'],
      dtype='object')

## 2. Tweet analysis

Para leer el contenido, usamos la librería transformers y la documentación de 'Huggingface'

In [9]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
df=training
df['text_ tokens'] = df['text_ tokens'].str.split('\t')

In [10]:
#Subset: English language
dd = df[df.language == 'D3164C7FBCF2565DDF915B1B3AEFB1DC'].copy()
dd = dd.reset_index(drop=True)

In [11]:
dd['tweet_type'].value_counts()

TopLevel    100264
Retweet      71991
Quote        18548
Name: tweet_type, dtype: int64

In [12]:
#engagement_user_type = 'engaging_user_id'
engagement_user_type = 'engaged_with_user_id'

In [13]:
tweet_type = 'TopLevel'
d_tweet_type = dd.loc[dd['tweet_type'] == tweet_type]
d_tweet_type[engagement_user_type].value_counts()[:20]

C6758D692A850E4C67B2763B66D1CFA8    658
5FF622786FB4924A067BD44D4B717570    530
E5D1B83B0E02FAFF871EEEF276D18132    373
7C03844E8B2E0C7B4346D41028AB14E2    322
FBB188A3C1E05C41587AAAC00B5B1879    253
9D9C2BC354011249F2D4D9B9C4205AC9    211
F2A8BF0F4EB185E6D2E5E1A0DF4C33AE    191
A9DAB08351D94BDE86235B37D6E8C61D    186
416B919C0DAA48D42FF6780574034149    183
AE7F3DA35832DD67B71D74467CC355C4    180
8A800256378089EF53C6F655F8690490    176
19E38BE27BE3C55DB1D7B256A9C95576    170
6866DBF67EEF238FC104B49C70A6FBA8    168
54F4098AD6C3E94A524C1CFFE68E04E1    166
4F0721594E4D39E2ABB544EB952BCCF3    153
62BDF6115A75E53FAB212D6A2A41AD9F    147
AD741205F6BAD7FFC0210BCA566C4BA5    144
BBA6EAA7EF77AD86EB62B8EA4D153710    143
8B95626DCE50B9ECD10315EB3BD819DF    143
B6D2643B021752A409EAD1BE66558700    133
Name: engaged_with_user_id, dtype: int64

In [14]:
engaging_user_id = 'FBB188A3C1E05C41587AAAC00B5B1879'
dd_user = d_tweet_type.loc[dd[engagement_user_type] == engaging_user_id]

for index, row in dd_user.iterrows():    
    print(tokenizer.decode(d_tweet_type['text_ tokens'][index]))

[CLS] Because there are fewer natural resources, the people in power across the world - - who are mostly men - - can more easily exploit women, a new report found. https : / / t. co / R6wornZPr3 [SEP]
[CLS] This time - lapse shows just how quickly a bushfire in Australia overtook firefighters as they fought to contain it https : / / t. co / GuFvHxAtIO https : / / t. co / MuVvBxRwXF [SEP]
[CLS] Grief overshadowed politics at the # Grammys, as the shocking death of Kobe Bryant hours before the event resulted in a series of tributes throughout a show held at Staples Center, the venue where the NBA superstar occupied center stage with the LA Lakers https : / / t. co / wI0PzinYOt [SEP]
[CLS] While federal investigators try to determine what caused Kobe Bryant's helicopter crash, excerpts of air traffic control recordings help build a timeline of what happened in the flight's final moments https : / / t. co / xsvkY7d0SO [SEP]
[CLS] Billie Eilish makes history as the youngest artist to win So

In [15]:
engaging_user_id = 'FBB188A3C1E05C41587AAAC00B5B1879'
dd_user = d_tweet_type.loc[dd[engagement_user_type] == engaging_user_id]
list_twits = []
valid_pattern = 'https : \/ \/ t\. co \/ [\dA-Za-z\.-]+'

for index, row in dd_user.iterrows():
    list_twits.append(re.findall(valid_pattern, tokenizer.decode(dd['text_ tokens'][index])))
    
    #print(tokenizer.decode(d_tweet_type['text_ tokens'][index]))
    
for i, s in enumerate(list_twits):
    list_twits_item = []
    for item in list_twits[i]:
        list_twits_item.append(item.replace(' ','')) # I use replace because s.strip() is not working 
        print(list_twits_item)
#print(list_twits)

['https://t.co/R6wornZPr3']
['https://t.co/GuFvHxAtIO']
['https://t.co/GuFvHxAtIO', 'https://t.co/MuVvBxRwXF']
['https://t.co/wI0PzinYOt']
['https://t.co/xsvkY7d0SO']
['https://t.co/ONj5JgNcUN']
['https://t.co/ONj5JgNcUN', 'https://t.co/vx8yVFuXYc']
['https://t.co/zl18w15YWx']
['https://t.co/zl18w15YWx']
['https://t.co/SFei3iDvG7']
['https://t.co/9Uh2b3Da5R']
['https://t.co/ABAI3gKjig']
['https://t.co/fLjiUBSsxq']
['https://t.co/ENKIXSXdrJ']
['https://t.co/Ts1UOJAtvo']
['https://t.co/1eEGnw0Oe1']
['https://t.co/1eEGnw0Oe1', 'https://t.co/EtYUQpcRqs']
['https://t.co/Dvc0yK9stk']
['https://t.co/fLjiUBSsxq']
['https://t.co/n7oIRuC57L']
['https://t.co/5TZJrOERSS']
['https://t.co/wI0PzinYOt']
['https://t.co/A0xOZX04sR']
['https://t.co/3b46nZuajF']
['https://t.co/3b46nZuajF', 'https://t.co/fgyuU3CdXe']
['https://t.co/qfrPXd9Jq1']
['https://t.co/AnKMUA7anQ']
['https://t.co/Ve5M81qPrL']
['https://t.co/UAzu7fFbv6']
['https://t.co/7QzTjMMpw4']
['https://t.co/xq2KEvvgAI']
['https://t.co/LW491bYaX

In [16]:
#CNN ID: FBB188A3C1E05C41587AAAC00B5B1879