In [56]:
import csv
import time
import numpy as np
import pandas as pd

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.dummy import DummyRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA

from verstack.stratified_continuous_split import scsplit # pip install verstack
from nltk.corpus import stopwords 

import torch
import torch.nn.functional as F

from scipy import spatial
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

import os
import gensim
from gensim.models.doc2vec import Doc2Vec
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

### 基本思路
使用如下两组要素
- 数字变量
  1. favorites count (log_stand)
  2. followers count (log_stand)
  3. statues count (发推数量) (log_stand)
  4. friends count (log_stand)
  5. urls count (log_stand)
  6. verified (1 or 0) (这一部分我们不做log_stand)
  7. timestamp (一天内的早中晚（hour），距离大选的时间(month和day)，周几(wday)都可能产生影响)
- 文本变量
  1. tweet text
  2. hashtags 

将文本变量通过embedding处理后用CNN/LSTM得到一部分输出，另外对数字变量通过CNN也得到了一部分结果，最后汇总，用2层dense layer。


In [114]:
# Load data

train_data = pd.read_csv("data/train.csv")
eval_data = pd.read_csv("data/evaluation.csv")

# Pre-process the traning data
label = "text"
train_data[label] = train_data[label].map(lambda x: str.split(x, sep=" "))


label = "mentions"
train_data[label+"_count"] = train_data[label].map(lambda x: len(str.split(x)))
mentions = set()
train_data[label].apply(lambda x: mentions.update(x))
print("all mentions in train data: ", mentions)


label = "urls"
train_data[label] = train_data[label].map(lambda x: [] if x=="[]" else [str.strip(url) for url in str.split(x[1:-1], sep=",")])
train_data[label+"_count"] = train_data[label].map(lambda x: len(x))
urls = set()
train_data[label].apply(lambda x: None if len(x)<1 else urls.update(x))
train_data[label].apply(lambda x: None if len(x)<1 else hashtags.update(x))
print(f"average number of urls in train data: {train_data.urls_count.mean():1.3f}")
print("number of distinct urls in train data: ", len(urls))

label = "verified"
print("verified users: ", train_data["verified"].sum())


label = "hashtags"
train_data[label] = train_data[label].map(lambda x: [] if x=="[]" else [str.strip(tag) for tag in str.split(x[1:-1], sep=",")])
train_data[label+"_count"] = train_data[label].map(lambda x: len(x))
hashtags = set()
train_data[label].apply(lambda x: None if len(x)<1 else hashtags.update(x))
print(f"average number of hashtags in train data: {train_data.hashtags_count.mean():1.3f}")
print("number of distinct hashtags in train data:", len(hashtags))


label = "hashtags"
eval_data[label] = eval_data[label].map(lambda x: [] if x=="[]" else [str.strip(tag) for tag in str.split(x[1:-1], sep=",")])
eval_data[label+"_count"] = eval_data[label].map(lambda x: len(x))
hashtags2 = set()
eval_data[label].apply(lambda x: None if not x else hashtags2.update(x))
print(f"average number of hashtags in evaluation data: {eval_data.hashtags_count.mean():1.3f}")
print("number of distinct hashtags in evaluation data:", len(hashtags2))



# Treatment of time
# The time relative to the election (month and yday), day in the week, 
# and the hour all affect the number of retweets.
train_data["timestamp"] = train_data["timestamp"].map(lambda x: time.gmtime(x//1000))
# Only 2380 tweets are not posted in 2022, so we ignore the year
# train_data["year"] = train_data["timestamp"].map(lambda x: x.tm_year)
train_data["month"] = train_data["timestamp"].map(lambda x: x.tm_mon)
train_data["yday"] = train_data["timestamp"].map(lambda x: x.tm_yday)
train_data["wday"] = train_data["timestamp"].map(lambda x: x.tm_wday)
train_data["hour"] = train_data["timestamp"].map(lambda x: x.tm_hour)
print("number of tweets not posted in 2022 in train data:", len(train_data[train_data["timestamp"].map(lambda x: x.tm_year) < 2022]))

# We drop the following data:
# @TweetID: useless
# @mentions: none of the mentions in the train data are not null
# @mentions_count
# @hashtags_count: we use hashtags directly
# @urls: we use urls_count instead

train_data.drop(labels=["TweetID","timestamp", "mentions","mentions_count", "hashtags_count", "urls"], axis=1, inplace=True)
train_data.head(5)

all mentions in train data:  {']', '['}
average number of urls in train data: 0.529
number of distinct urls in train data:  185951
verified users:  10621
average number of hashtags in train data: 0.297
number of distinct hashtags in train data: 12093
average number of hashtags in evaluation data: 0.299
number of distinct hashtags in evaluation data: 5942
number of tweets not posted in 2022 in train data: 2380


Unnamed: 0,text,retweets_count,favorites_count,followers_count,statuses_count,friends_count,verified,hashtags,urls_count,month,yday,wday,hour
0,"[rt, refarcir, macron, ans, nom, prépare]",3,0,3682,453535,3628,0,[],0,3,70,4,5
1,[populaire],0,0,86,1016,284,0,[],0,3,78,5,12
2,"[faut, dégager, cinglé]",3,1,1944,28234,1995,0,[],0,3,74,1,18
3,"[enseignants, mettre, prescriptions, président...",0,0,1,1072,0,0,[],1,3,73,0,11
4,"[mafieuse, oppressive, macron]",0,0,13957,25311,10841,0,[],0,3,73,0,11


In [116]:
# Here we split our training data into trainig and testing set. This way we can estimate the evaluation of our model without uploading to Kaggle and avoid overfitting over our evaluation dataset.
# scsplit method is used in order to split our regression data in a stratisfied way and keep a similar distribution of retweet counts between the two sets
X_train, X_test, y_train, y_test = scsplit(train_data, train_data['retweets_count'], stratify=train_data['retweets_count'], train_size=0.7, test_size=0.3)

# We remove the actual number of retweets from our features since it is the value that we are trying to predict
X_train = X_train.drop(["retweets_count"], axis=1)
X_test = X_test.drop(["retweets_count"], axis=1)

# log transform
def log_transform(data, list_column):
        for name_column in list_column:
            data[name_column] = np.log(data[name_column])

# standardise the columns of numeric values
def standardise(data,list_column):
    for name_column in list_column:
        # standardize
        if data[name_column].std()!=0:
            data[name_column]=(data[name_column]-data[name_column].mean())/data[name_column].std()
        else:
            data[name_column]=(data[name_column]-data[name_column].mean())

standardise(X_train,["favorites_count","followers_count","statuses_count","friends_count","urls_count","month","yday","wday","hour"])
standardise(X_test, ["favorites_count","followers_count","statuses_count","friends_count","urls_count","month","yday","wday","hour"])

standardise(X_train,["favorites_count","followers_count","statuses_count","friends_count","urls_count","month","yday","wday","hour"])
standardise(X_test, ["favorites_count","followers_count","statuses_count","friends_count","urls_count","month","yday","wday","hour"])

# split the table into 2 parts: one with the text and the other with the numbers
X_train_num=X_train.drop(["text", "hashtags"], axis=1)
X_test_num=X_test.drop(["text", "hashtags"], axis=1)

X_train_text=X_train[["text", "hashtags"]]
X_test_text=X_test[["text", "hashtags"]]

In [117]:
# This part is used to train the embedding of the vocabulary, only needed to be run once

path_text_dataset='French-Word-Embeddings/Data/data.txt'
text_dataset=pd.read_table(path_text_dataset).values
text_dataset=[item[0][:-1] for item in text_dataset]

TaggedDocument=gensim.models.doc2vec.TaggedDocument

def X_text(sentences):
    X=[]
    for i,text in enumerate(sentences):
        words=text.split(" ")
        l=len(words)
        words=words[:-2]
        document=TaggedDocument(words,tags=[i])
        X.append(document)
    return X

X_documents=X_text(text_dataset)
X_documents[:5]

def train_text(text_train,size=10,epochs=10):
    model=Doc2Vec(text_train,min_count=1,window=3,vector_size=size,sample=1e-3,negative=5,epochs=epochs)
    model.train(text_train,total_examples=model.corpus_count,epochs=model.epochs)
    return model

if not os.path.exists('./WE_models'):
    os.mkdir('./WE_models')

# run only once this part  
# model_text10=train_text(X_documents, size=10)
# model_text10.save('WE_models/d2v_10D')
# model_text5=train_text(X_documents, size=5)
# model_text5.save('WE_models/d2v_5D')

In [118]:
# Here we load the embedding model:
model_text10 = Doc2Vec.load('WE_models/d2v_10D')
model_text5 = Doc2Vec.load('WE_models/d2v_5D')

In [121]:
# Tranform the textual part to vector

def text2vec(text_train, model, alpha=0.05, min_alpha=0.025, epochs=10, tags=True):
    list_text=[]
    for sentence in text_train:
        vec = model.infer_vector(doc_words=sentence, alpha=alpha, min_alpha=min_alpha, epochs=epochs).tolist()
        list_text.append(vec)
    return np.array(list_text)

# text to vector
train_text_tensor=torch.Tensor(text2vec(X_train_text["text"], model=model_text10))
test_text_tensor=torch.Tensor(text2vec(X_test_text["text"], model=model_text10))

# hashtags to vector
train_tags_tensor=torch.Tensor(text2vec(X_train_text["hashtags"], model=model_text5))
test_tags_tensor=torch.Tensor(text2vec(X_test_text["hashtags"], model=model_text5))

# combine the two vectors
X_train_text_tensor = torch.cat((train_text_tensor, train_tags_tensor), axis=1)
X_test_text_tensor = torch.cat((test_text_tensor, test_tags_tensor), axis=1)


(torch.Size([247778, 15]), torch.Size([106191, 15]))

In [126]:
X_train_num_tensor=torch.Tensor(X_train_num.values)
X_test_num_tensor=torch.Tensor(X_test_num.values)

X_train_tensor=torch.cat((X_train_num_tensor, X_train_text_tensor), axis=1)
X_test_tensor=torch.cat((X_test_num_tensor, X_test_text_tensor), axis=1)

y_train_tensor=torch.Tensor(y_train.to_numpy()).reshape((-1,1))
y_test_tensor=torch.Tensor(y_test.to_numpy()).reshape((-1,1))

print(X_train_num_tensor.shape, y_train_tensor.shape)
print(X_train_text_tensor.shape, X_test_text_tensor.shape)
print(X_train_tensor.shape, X_test_tensor.shape)

torch.Size([247778, 10]) torch.Size([247778, 1])
torch.Size([247778, 15]) torch.Size([106191, 15])
torch.Size([247778, 25]) torch.Size([106191, 25])
