In [1]:
import csv
import numpy as np
import pandas as pd 
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier, RandomForestRegressor
from sklearn.dummy import DummyRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_error, confusion_matrix, plot_confusion_matrix
from verstack.stratified_continuous_split import scsplit # pip install verstack
from nltk.corpus import stopwords 
from collections import Counter
import time
import matplotlib.pyplot as plt
import os
from catboost import CatBoostRegressor, Pool, metrics, cv
from sklearn.metrics import accuracy_score

## Hyperparameters

In [14]:
# Value passed as n_estimators to the GradientBoostingClassifier that predicts the retweet class.
n_estimators_classifier = 100
# Values passed as iterations to the CatBoost regressors model_0, model_1 and model_2 respectively.
n_iterations_class_0 = 3000
n_iterations_class_1 = 10000
n_iterations_class_2 = 5000

## Data processing

In [6]:
def hash_to_list(x):
    """Turn a stringified list such as "['a', 'b']" into ['a', 'b'].

    The first and last characters of ``x`` (the brackets) are dropped and the
    remainder is split on commas. Every piece is stripped of surrounding
    whitespace and then loses its own first and last character (the quotes).
    The string "[]" therefore produces a list holding one empty string.
    """
    inner = x[1:-1]
    items = []
    for piece in inner.split(","):
        trimmed = piece.strip()
        items.append(trimmed[1:-1])
    return items

def extract_part_of_link(x):
    """Return the third '/'-separated component of every link in ``x``.

    For a link shaped like ``scheme://host/path`` that component is ``host``.

    Args:
        x: iterable of link strings.

    Returns:
        A list with one string per input link. Empty links map to "", and so
        do links with fewer than three '/'-separated components (these used
        to raise IndexError).
    """
    pieces_per_link = (link.split('/') for link in x)
    # "".split('/') is [''], so the length check also covers the empty link.
    return [pieces[2] if len(pieces) > 2 else "" for pieces in pieces_per_link]

def add_features(dataset):
    """Add the engineered feature columns to ``dataset`` in place.

    Columns written:

    - ``hashtags``, ``urls``, ``mentions``: replaced by the lists produced by
      ``hash_to_list``.
    - ``day_of_the_year``, ``month_of_the_year``, ``day_of_the_month``,
      ``hour_of_the_day``: calendar fields of ``time.gmtime(timestamp)``.
    - ``text_len``: number of pieces in ``text`` when split on single spaces.
    - ``urls_count``, ``mentions_count``, ``hashtags_count``: for each row,
      the sum over its non-empty items of how often the item occurs in that
      column of ``dataset``. Rows whose first item is "" do not contribute
      to the occurrence totals.

    Args:
        dataset: DataFrame with ``hashtags``, ``urls``, ``mentions``,
            ``timestamp`` and ``text`` columns.

    Returns:
        None. ``dataset`` itself is modified.
    """
    # Parse the stringified list columns into real Python lists.
    for column in ("hashtags", "urls", "mentions"):
        dataset[column] = dataset[column].apply(hash_to_list)

    # Convert each timestamp once and read all four calendar fields from the
    # same struct_time, instead of one row-wise DataFrame.apply per field.
    # NOTE(review): time.gmtime reads its argument as seconds since the epoch;
    # confirm the timestamp column is not stored in milliseconds.
    utc_times = [time.gmtime(stamp) for stamp in dataset["timestamp"]]
    dataset['day_of_the_year'] = [moment.tm_yday for moment in utc_times]
    dataset['month_of_the_year'] = [moment.tm_mon for moment in utc_times]
    dataset['day_of_the_month'] = [moment.tm_mday for moment in utc_times]
    dataset['hour_of_the_day'] = [moment.tm_hour for moment in utc_times]
    dataset['text_len'] = dataset['text'].apply(lambda text: len(text.split(' ')))

    for column in ["urls", "mentions", "hashtags"]:
        occurrences = Counter()
        for items in dataset[column]:
            # A leading "" marks a row without entries; it adds nothing.
            if items[0] == "":
                continue
            occurrences.update(items)

        def column_count(items, occurrences=occurrences):
            # Counter returns 0 for items it has never seen, so a row that was
            # skipped above can no longer raise KeyError here.
            return sum(occurrences[item] for item in items if item != "")

        dataset[f"{column}_count"] = dataset[column].apply(column_count)

In [7]:
# Work from the project directory so the relative CSV names below resolve.
os.chdir('/homes/dberezhnaia/mediastinum-segm/')

# Read the training set and the evaluation set.
train_data = pd.read_csv("train.csv")
eval_data = pd.read_csv("evaluation.csv")

# Engineer the extra feature columns; add_features modifies each frame in place.
add_features(train_data)
add_features(eval_data)

# Regression target: the raw retweet count.
y_train = train_data['retweets_count']

# Classification target: 0 for counts <= 3, 1 for counts <= 90, 2 otherwise.
y_train_cl = y_train.map(lambda count: 0 if count <= 3 else (1 if count <= 90 else 2))

# Log-scale target, consumed by the regressor of the third class.
y_train_log = np.log(y_train)

# The target must not remain among the features, so it is dropped from the frame.
train_data = train_data.drop(columns=['retweets_count'])

In [8]:
# We set up a TF-IDF vectorizer that keeps the top 100 tokens from the tweets and removes French stopwords.
# It is fitted on the training tweets only and then applied to both the training and the evaluation tweets.
vectorizer = TfidfVectorizer(max_features=100, stop_words=stopwords.words('french'))
X_train_text = vectorizer.fit_transform(train_data['text'])
eval_data_text = vectorizer.transform(eval_data['text'])

# Densify each sparse text matrix once; both feature sets below reuse the result
# instead of calling .toarray() again.
X_train_text_dense = X_train_text.toarray()
eval_data_text_dense = eval_data_text.toarray()

# Columns used by the feature matrices X_train_all / eval_all.
train_columns = ["favorites_count","followers_count", "friends_count", "verified", "timestamp", 
                 "urls_count", "hashtags_count"]

# Columns used by the classifier and by the feature matrices X_train_all_cl / eval_all_cl.
train_columns_cl = ["favorites_count","followers_count", "friends_count", "verified", "timestamp",
                    "statuses_count", "day_of_the_year", "month_of_the_year", "hour_of_the_day",
                    "urls_count", "hashtags_count"]

# Concatenate the tabular columns with the text features for both column sets.
X_train_all = np.hstack((train_data.loc[:, train_columns].to_numpy(), X_train_text_dense))
eval_all = np.hstack((eval_data.loc[:, train_columns].to_numpy(), eval_data_text_dense))

X_train_all_cl = np.hstack((train_data.loc[:, train_columns_cl].to_numpy(), X_train_text_dense))
eval_all_cl = np.hstack((eval_data.loc[:, train_columns_cl].to_numpy(), eval_data_text_dense))

## Classification

In [11]:
# Gradient-boosting classifier that predicts the retweet class (0, 1 or 2).
# random_state is fixed, like the seed of the CatBoost regressors, so that a
# re-run of the notebook fits the same classifier.
cl = GradientBoostingClassifier(n_estimators=n_estimators_classifier, random_state=42)

# Fit the classifier on the tabular classification columns of the training data.
cl.fit(train_data.loc[:,train_columns_cl] , y_train_cl)

# Predict the class of every evaluation tweet.
y_pred_cl = cl.predict(eval_data.loc[:,train_columns_cl])

In [12]:
#Here we divide our data into three classes so that each of the three models can be trained separately
def find_classes(labels):
    """Group the positions of ``labels`` by class.

    Returns a tuple ``(ids_0, ids_1, ids_2)`` of position lists: positions
    whose label equals 0, positions whose label equals 1, and positions of
    every other label.
    """
    buckets = ([], [], [])
    for position, label in enumerate(labels):
        if label == 0:
            slot = 0
        elif label == 1:
            slot = 1
        else:
            slot = 2
        buckets[slot].append(position)
    return buckets

In [13]:
# Positions of the training rows in class 0, class 1 and class 2; each list selects
# the rows that the regressor of that class is fitted on.
ids_0, ids_1, ids_2 = find_classes(y_train_cl)

## Regression

In [15]:
# Regressor for class 0 (retweet counts <= 3), fitted only on the training rows of that class.
model_0 = CatBoostRegressor(
    iterations=n_iterations_class_0,
    random_seed=42,
    # train_dir takes the directory path itself. os.chdir(...) returns None, so
    # wrapping the path in it left train_dir unset and only moved the working
    # directory of the process.
    train_dir='/homes/dberezhnaia/mediastinum-segm/catboost_output'
)
# No feature column is declared categorical.
categorical_features_indices = []

model_0.fit(
    X_train_all[ids_0],
    y_train.iloc[ids_0],
    cat_features=categorical_features_indices,
    plot=True
);

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.041281
0:	learn: 1.0902766	total: 77ms	remaining: 3m 50s
1:	learn: 1.0748264	total: 96ms	remaining: 2m 23s
2:	learn: 1.0605368	total: 121ms	remaining: 2m
3:	learn: 1.0473083	total: 142ms	remaining: 1m 46s
4:	learn: 1.0347978	total: 161ms	remaining: 1m 36s
5:	learn: 1.0231862	total: 180ms	remaining: 1m 29s
6:	learn: 1.0123662	total: 200ms	remaining: 1m 25s
7:	learn: 1.0024780	total: 218ms	remaining: 1m 21s
8:	learn: 0.9932092	total: 237ms	remaining: 1m 18s
9:	learn: 0.9845523	total: 254ms	remaining: 1m 16s
10:	learn: 0.9765318	total: 274ms	remaining: 1m 14s
11:	learn: 0.9691365	total: 291ms	remaining: 1m 12s
12:	learn: 0.9621770	total: 308ms	remaining: 1m 10s
13:	learn: 0.9557948	total: 326ms	remaining: 1m 9s
14:	learn: 0.9499370	total: 342ms	remaining: 1m 8s
15:	learn: 0.9444355	total: 361ms	remaining: 1m 7s
16:	learn: 0.9393543	total: 378ms	remaining: 1m 6s
17:	learn: 0.9347687	total: 396ms	remaining: 1m 5s
18:	learn: 0.9304818	total: 414ms	remaining: 1m 4s
19:	

In [16]:
# Regressor for class 1 (retweet counts above 3 and up to 90), fitted only on the training rows of that class.
model_1 = CatBoostRegressor(
    iterations=n_iterations_class_1,
    random_seed=42,
    # train_dir takes the directory path itself. os.chdir(...) returns None, so
    # wrapping the path in it left train_dir unset and only moved the working
    # directory of the process.
    train_dir='/homes/dberezhnaia/mediastinum-segm/catboost_output'
)
# No feature column is declared categorical.
categorical_features_indices = []

model_1.fit(
    X_train_all[ids_1],
    y_train.iloc[ids_1],
    cat_features=categorical_features_indices,
    plot=True
);

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.011552
0:	learn: 16.6781228	total: 9.81ms	remaining: 1m 38s
1:	learn: 16.5595031	total: 17.5ms	remaining: 1m 27s
2:	learn: 16.4444720	total: 24.7ms	remaining: 1m 22s
3:	learn: 16.3272619	total: 32.6ms	remaining: 1m 21s
4:	learn: 16.2141308	total: 39.5ms	remaining: 1m 19s
5:	learn: 16.1017213	total: 47.1ms	remaining: 1m 18s
6:	learn: 15.9909599	total: 55ms	remaining: 1m 18s
7:	learn: 15.8805676	total: 62.1ms	remaining: 1m 17s
8:	learn: 15.7736485	total: 69.3ms	remaining: 1m 16s
9:	learn: 15.6684390	total: 76.5ms	remaining: 1m 16s
10:	learn: 15.5624515	total: 84.4ms	remaining: 1m 16s
11:	learn: 15.4593683	total: 91.6ms	remaining: 1m 16s
12:	learn: 15.3568207	total: 99.3ms	remaining: 1m 16s
13:	learn: 15.2565739	total: 107ms	remaining: 1m 16s
14:	learn: 15.1593909	total: 114ms	remaining: 1m 16s
15:	learn: 15.0625131	total: 122ms	remaining: 1m 15s
16:	learn: 14.9704092	total: 129ms	remaining: 1m 15s
17:	learn: 14.8777903	total: 137ms	remaining: 1m 15s
18:	learn: 14.7

In [17]:
# Regressor for class 2 (retweet counts above 90), fitted only on the training rows of that class.
# It uses the classification feature matrix and the log of the retweet count as its target.
model_2 = CatBoostRegressor(
    iterations=n_iterations_class_2,
    random_seed=42,
    # train_dir takes the directory path itself. os.chdir(...) returns None, so
    # wrapping the path in it left train_dir unset and only moved the working
    # directory of the process.
    train_dir='/homes/dberezhnaia/mediastinum-segm/catboost_output'
)
# No feature column is declared categorical.
categorical_features_indices = []

model_2.fit(
    X_train_all_cl[ids_2],
    y_train_log.iloc[ids_2],
    cat_features=categorical_features_indices,
    plot=True
);

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.015075
0:	learn: 1.0053259	total: 6.39ms	remaining: 31.9s
1:	learn: 0.9943416	total: 10.3ms	remaining: 25.6s
2:	learn: 0.9838493	total: 14ms	remaining: 23.4s
3:	learn: 0.9731794	total: 17.9ms	remaining: 22.4s
4:	learn: 0.9626595	total: 21.7ms	remaining: 21.7s
5:	learn: 0.9524836	total: 25.4ms	remaining: 21.1s
6:	learn: 0.9424625	total: 29ms	remaining: 20.7s
7:	learn: 0.9326891	total: 32.9ms	remaining: 20.5s
8:	learn: 0.9231080	total: 37.5ms	remaining: 20.8s
9:	learn: 0.9138905	total: 41.4ms	remaining: 20.7s
10:	learn: 0.9048734	total: 45.5ms	remaining: 20.6s
11:	learn: 0.8958756	total: 51.4ms	remaining: 21.4s
12:	learn: 0.8869927	total: 55.6ms	remaining: 21.3s
13:	learn: 0.8783045	total: 59.4ms	remaining: 21.1s
14:	learn: 0.8697935	total: 63.8ms	remaining: 21.2s
15:	learn: 0.8609400	total: 67.3ms	remaining: 21s
16:	learn: 0.8525450	total: 71.6ms	remaining: 21s
17:	learn: 0.8442677	total: 75.4ms	remaining: 20.9s
18:	learn: 0.8361778	total: 80.5ms	remaining: 21.1s


## Prediction

In [18]:
def make_prediction(classifier, reg_0, reg_1, reg_2,
                    class_features=None, reg_features=None, reg_features_cl=None):
    """Predict a retweet count for every evaluation row with the two-stage model.

    The classifier first assigns each row to class 0, 1 or 2. The regressor of
    that class then predicts the count, which is clamped to the range of the
    class and rounded to a whole number.

    Args:
        classifier: fitted model whose ``predict`` returns the class per row.
        reg_0: fitted regressor for class 0; its output is clamped to [0, 3].
        reg_1: fitted regressor for class 1; its output is clamped to [4, 90].
        reg_2: fitted regressor for class 2; it predicts the log of the count,
            so its output is exponentiated and raised to at least 91.
        class_features: input for ``classifier.predict``. Defaults to
            ``eval_data.loc[:, train_columns_cl]``.
        reg_features: feature matrix for ``reg_0`` and ``reg_1``. Defaults to
            ``eval_all``.
        reg_features_cl: feature matrix for ``reg_2``. Defaults to
            ``eval_all_cl``.

    Returns:
        A 1-D float array holding one whole-number prediction per row.
    """
    # Fall back to the notebook-level evaluation data when nothing is passed,
    # which keeps the original four-argument call working.
    if class_features is None:
        class_features = eval_data.loc[:, train_columns_cl]
    if reg_features is None:
        reg_features = eval_all
    if reg_features_cl is None:
        reg_features_cl = eval_all_cl
    reg_features = np.asarray(reg_features)
    reg_features_cl = np.asarray(reg_features_cl)

    # Use the models that were passed in, not the notebook globals.
    predicted_class = np.asarray(classifier.predict(class_features)).ravel()
    is_class_0 = predicted_class == 0
    is_class_1 = predicted_class == 1
    is_class_2 = ~(is_class_0 | is_class_1)

    y_pred = np.empty(len(reg_features), dtype=float)

    # Each regressor is called once on all rows of its class. The .any() guards
    # avoid calling predict on an empty matrix.
    if is_class_0.any():
        raw = np.asarray(reg_0.predict(reg_features[is_class_0]), dtype=float).ravel()
        y_pred[is_class_0] = np.round(np.clip(raw, 0, 3))
    if is_class_1.any():
        raw = np.asarray(reg_1.predict(reg_features[is_class_1]), dtype=float).ravel()
        y_pred[is_class_1] = np.round(np.clip(raw, 4, 90))
    if is_class_2.any():
        raw = np.asarray(reg_2.predict(reg_features_cl[is_class_2]), dtype=float).ravel()
        y_pred[is_class_2] = np.round(np.maximum(np.exp(raw), 91))

    return y_pred

In [19]:
# Run the classifier and the three per-class regressors over the evaluation data.
prediction = make_prediction(classifier=cl, reg_0=model_0, reg_1=model_1, reg_2=model_2)

In [None]:
# Write the submission file: a header row, then one "TweetID,retweets_count" row per evaluation tweet.
# newline='' leaves line-ending handling to the csv module.
with open("daria_1.txt", 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["TweetID", "retweets_count"])
    # The loop variables are named so that they do not overwrite `prediction`.
    for tweet_id, retweets in zip(eval_data['TweetID'], prediction):
        # .item() accepts a plain scalar as well as a one-element array.
        writer.writerow([str(tweet_id), str(int(np.asarray(retweets).item()))])