In [45]:
import csv
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.dummy import DummyRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_error
from verstack.stratified_continuous_split import scsplit # pip install verstack
from nltk.corpus import stopwords 
from collections import Counter
#import nltk
#nltk.download('stopwords')
# Load the training data: read train.csv (resolved relative to the working directory)
# into the DataFrame that the rest of this cell parses and splits.
train_data = pd.read_csv("train.csv")
def hash_to_list(x):
    """Turn the string form of a list (e.g. "['a', 'b']") into a list of strings.

    The first and last characters of `x` (the brackets) are dropped, the rest
    is split on commas, and each piece is stripped of surrounding whitespace
    before its own first and last characters (the quotes) are dropped.
    An empty list literal "[]" therefore yields [""] rather than [].
    """
    inner = x[1:-1]
    items = []
    for piece in inner.split(","):
        trimmed = piece.strip()
        items.append(trimmed[1:-1])
    return items
def extract_part_of_link(x):
    """Return, for each link in `x`, the third '/'-separated segment.

    For a link of the form "scheme://host/path" that segment is the host.
    Empty entries map to "". Entries with fewer than three segments (for
    example a link without "scheme://") also map to "" instead of raising
    IndexError.
    """
    hosts = []
    for link in x:
        segments = link.split('/')
        # "" splits into [""], so the empty-entry case is covered by the same guard.
        hosts.append(segments[2] if len(segments) > 2 else "")
    return hosts
# The list-valued columns are stored as strings in the CSV; parse each one into a
# real Python list. Not idempotent: hash_to_list expects the raw string form.
for list_column in ("hashtags", "urls", "mentions"):
    train_data[list_column] = train_data[list_column].apply(hash_to_list)

def column_count(x):
    """Sum the frequencies of the non-empty tokens of one row.

    Frequencies are read from the module-level table `d`, which the loop below
    rebuilds for each column. Tokens missing from `d` contribute 0 instead of
    raising KeyError.
    """
    return sum(d.get(i, 0) for i in x if i != "")

columns = ["urls", "mentions", "hashtags"]
for c in columns:
    # Frequency of every token of column `c` across the frame. Rows whose first
    # entry is "" (an empty list in the CSV) are skipped entirely.
    # NOTE(review): this runs on all of train_data, i.e. before the train/test split
    # performed later in this cell, so held-out rows contribute to these counts.
    d = Counter(
        token
        for tokens in train_data[c]
        if tokens[0] != ""
        for token in tokens
    )
    # column_count reads `d`, so it must be applied before `d` is rebuilt for the next column.
    train_data[f"{c}_count"] = train_data[c].apply(column_count)
    # Histogram of frequencies: how many distinct tokens occur exactly k times.
    res = Counter(d.values())
    print(f"Statistic for column \"{c}\" is : {res}")
# Feature engineering
#train_data["follower_friend_ratio"] = (train_data["followers_count"]+0.5)/(train_data["friends_count"]+0.5)
#train_data["follower_status_ratio"] = train_data["followers_count"]/train_data["statuses_count"]
#train_data["follower_status_ratio_likes"] = train_data["follower_status_ratio"]*train_data["favorites_count"]
#train_data["follower_friend_ratio_likes"] = train_data["follower_friend_ratio"]*train_data["favorites_count"]
#train_data["mentions"] = train_data["mentions"].apply(hash_to_list)
#train_data["urls_site"] = train_data["urls"].apply(extract_part_of_link)
# Here we split our training data into a training and a testing set. This way we can estimate the performance of our model without uploading to Kaggle and avoid overfitting to the evaluation dataset.
# The scsplit method is used to split our regression data in a stratified way, keeping a similar distribution of retweet counts between the two sets.
# The random state is fixed so that the split (and every score computed from it) is the same on each run.
X_train, X_test, y_train, y_test = scsplit(train_data, train_data['retweets_count'], stratify=train_data['retweets_count'], train_size=0.7, test_size=0.3, random_state=42)

# We remove the actual number of retweets from our features since it is the value that we are trying to predict
X_train = X_train.drop(['retweets_count'], axis=1)
X_test = X_test.drop(['retweets_count'], axis=1)

# You can examine the available features using X_train.head()
# X_train.head()
# Parsed "urls" entry of the row at position 20, kept for ad-hoc inspection.
t = train_data.iloc[20]["urls"]

Statistic for column "urls" is : Counter({1: 184815, 2: 1043, 3: 51, 4: 20, 5: 8, 7: 4, 8: 3, 9: 2, 6: 2, 18: 1, 11: 1, 10: 1})
Statistic for column "mentions" is : Counter()
Statistic for column "hashtags" is : Counter({1: 7175, 2: 2108, 3: 704, 4: 444, 5: 258, 6: 207, 7: 151, 8: 106, 9: 98, 10: 61, 11: 56, 12: 42, 13: 36, 14: 35, 15: 30, 17: 27, 16: 27, 18: 25, 24: 23, 21: 20, 20: 19, 23: 16, 22: 15, 27: 14, 30: 14, 25: 14, 33: 13, 19: 12, 26: 10, 32: 9, 40: 9, 53: 8, 39: 8, 31: 8, 41: 8, 28: 8, 59: 7, 29: 7, 43: 6, 61: 6, 46: 6, 48: 6, 47: 6, 37: 5, 57: 5, 35: 5, 45: 5, 88: 4, 42: 4, 49: 4, 52: 4, 70: 4, 242: 3, 34: 3, 54: 3, 36: 3, 63: 3, 83: 3, 130: 3, 62: 3, 75: 3, 44: 3, 458: 2, 97: 2, 93: 2, 51: 2, 112: 2, 133: 2, 205: 2, 255: 2, 79: 2, 84: 2, 131: 2, 60: 2, 153: 2, 76: 2, 119: 2, 106: 2, 248: 2, 65: 2, 58: 2, 180: 2, 87: 2, 55: 2, 38: 2, 2058: 1, 1725: 1, 1032: 1, 1376: 1, 2075: 1, 406: 1, 451: 1, 2063: 1, 827: 1, 1284: 1, 13928: 1, 550: 1, 483: 1, 5629: 1, 238: 1, 230: 1, 310

In [60]:
# Build a TF-IDF vectorizer restricted to the 100 top tokens of the tweets, with
# French stopwords removed. It is fitted on the training split only and then
# applied to both the training and the testing split.
french_stopwords = stopwords.words('french')
vectorizer = TfidfVectorizer(max_features=100, stop_words=french_stopwords)
X_train_text = vectorizer.fit_transform(X_train['text'])
X_test_text = vectorizer.transform(X_test['text'])
print(X_train_text.shape)

# Non-text columns fed to the models: raw account/tweet columns plus the
# frequency-based *_count features.
train_columns = [
    "favorites_count", "followers_count", "statuses_count", "friends_count",
    "verified", "timestamp",
    "urls_count", "mentions_count", "hashtags_count",
]

# Shape of everything except the raw text column, for a quick sanity check.
non_text_mask = X_train.columns != 'text'
print(X_train.loc[:, non_text_mask].shape)

# Final design matrices: selected columns followed by the dense TF-IDF columns.
X_train_all = np.concatenate(
    [X_train[train_columns].to_numpy(), X_train_text.toarray()], axis=1
)
X_test_all = np.concatenate(
    [X_test[train_columns].to_numpy(), X_test_text.toarray()], axis=1
)

(247778, 100)
(247778, 13)


## GradientBoosting

In [62]:
# Now we can train our model. Here we chose a Gradient Boosting Regressor with 500 trees;
# no loss is passed, so the estimator's default loss function is used.
# The random state is fixed so that the reported error is reproducible.
reg = GradientBoostingRegressor(n_estimators=500, random_state=42)

# We fit our model using the training data
reg.fit(X_train_all, y_train)
# And then we predict the values for our testing set
y_pred = reg.predict(X_test_all)
# We want to make sure that all predictions are non-negative integers
# (negative values become 0, the others are truncated by int()).
y_pred = [int(value) if value >= 0 else 0 for value in y_pred]

# The reported error is the mean absolute error on the held-out split.
print("Prediction error:", mean_absolute_error(y_true=y_test, y_pred=y_pred))



Prediction error: 6.059684907383865


## Catboost

In [59]:
from catboost import CatBoostRegressor, Pool, metrics, cv
from sklearn.metrics import accuracy_score

# No column is declared categorical: every feature is passed to CatBoost as numeric.
categorical_features_indices = []

# CatBoost regressor with 3000 boosting iterations and a fixed seed.
model = CatBoostRegressor(iterations=3000, random_seed=42)

# Fit on the training split; plot=True requests the interactive training plot.
model.fit(X_train_all, y_train, cat_features=categorical_features_indices, plot=True);

# Predict on the held-out split, then turn every prediction into a non-negative
# integer: negative values become 0, the others are truncated by int().
raw_predictions = model.predict(X_test_all)
y_pred = []
for value in raw_predictions:
    y_pred.append(int(value) if value >= 0 else 0)

# Mean absolute error on the held-out split.
print("Prediction error:", mean_absolute_error(y_true=y_test, y_pred=y_pred))


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.040044
0:	learn: 236.3338309	total: 20.4ms	remaining: 1m 1s
1:	learn: 231.2205057	total: 34.2ms	remaining: 51.3s
2:	learn: 226.5057326	total: 47ms	remaining: 46.9s
3:	learn: 221.7371808	total: 60.2ms	remaining: 45.1s
4:	learn: 217.3961957	total: 73.5ms	remaining: 44s
5:	learn: 213.2285808	total: 87.7ms	remaining: 43.8s
6:	learn: 208.9005806	total: 101ms	remaining: 43.1s
7:	learn: 204.9770461	total: 115ms	remaining: 43s
8:	learn: 201.2568021	total: 128ms	remaining: 42.5s
9:	learn: 197.6447595	total: 141ms	remaining: 42.1s
10:	learn: 194.0818238	total: 153ms	remaining: 41.6s
11:	learn: 190.7368608	total: 166ms	remaining: 41.4s
12:	learn: 187.1880894	total: 182ms	remaining: 41.9s
13:	learn: 184.1074507	total: 195ms	remaining: 41.5s
14:	learn: 181.0688547	total: 207ms	remaining: 41.2s
15:	learn: 178.0649210	total: 220ms	remaining: 41.1s
16:	learn: 175.3243404	total: 233ms	remaining: 40.8s
17:	learn: 172.5667132	total: 244ms	remaining: 40.5s
18:	learn: 169.9914767	tot

161:	learn: 77.4414938	total: 2.24s	remaining: 39.3s
162:	learn: 77.2560401	total: 2.25s	remaining: 39.2s
163:	learn: 76.9947305	total: 2.27s	remaining: 39.2s
164:	learn: 76.8709325	total: 2.28s	remaining: 39.2s
165:	learn: 76.7617337	total: 2.29s	remaining: 39.1s
166:	learn: 76.6456459	total: 2.31s	remaining: 39.1s
167:	learn: 76.3762512	total: 2.32s	remaining: 39.1s
168:	learn: 76.0998493	total: 2.33s	remaining: 39.1s
169:	learn: 75.9794056	total: 2.35s	remaining: 39.1s
170:	learn: 75.8782586	total: 2.36s	remaining: 39s
171:	learn: 75.7794591	total: 2.37s	remaining: 39s
172:	learn: 75.5905119	total: 2.38s	remaining: 38.9s
173:	learn: 75.4496800	total: 2.4s	remaining: 38.9s
174:	learn: 75.1446417	total: 2.41s	remaining: 38.9s
175:	learn: 75.0428424	total: 2.42s	remaining: 38.8s
176:	learn: 74.7889283	total: 2.43s	remaining: 38.8s
177:	learn: 74.6907521	total: 2.44s	remaining: 38.8s
178:	learn: 74.4285425	total: 2.46s	remaining: 38.7s
179:	learn: 74.1782496	total: 2.47s	remaining: 38.7

329:	learn: 59.7402787	total: 4.48s	remaining: 36.2s
330:	learn: 59.6478263	total: 4.5s	remaining: 36.2s
331:	learn: 59.4958056	total: 4.51s	remaining: 36.2s
332:	learn: 59.4042469	total: 4.52s	remaining: 36.2s
333:	learn: 59.2980807	total: 4.54s	remaining: 36.2s
334:	learn: 59.2690436	total: 4.55s	remaining: 36.2s
335:	learn: 59.1712415	total: 4.57s	remaining: 36.2s
336:	learn: 59.1512578	total: 4.58s	remaining: 36.2s
337:	learn: 59.1280002	total: 4.59s	remaining: 36.1s
338:	learn: 59.0410791	total: 4.6s	remaining: 36.1s
339:	learn: 58.9492096	total: 4.62s	remaining: 36.1s
340:	learn: 58.9281889	total: 4.63s	remaining: 36.1s
341:	learn: 58.9046595	total: 4.64s	remaining: 36.1s
342:	learn: 58.8801438	total: 4.65s	remaining: 36s
343:	learn: 58.8641219	total: 4.67s	remaining: 36s
344:	learn: 58.7802492	total: 4.68s	remaining: 36s
345:	learn: 58.7644626	total: 4.69s	remaining: 36s
346:	learn: 58.7463551	total: 4.7s	remaining: 36s
347:	learn: 58.6881049	total: 4.72s	remaining: 35.9s
348:	l

488:	learn: 51.4075594	total: 6.72s	remaining: 34.5s
489:	learn: 51.3526387	total: 6.74s	remaining: 34.5s
490:	learn: 51.2805855	total: 6.75s	remaining: 34.5s
491:	learn: 51.2511814	total: 6.77s	remaining: 34.5s
492:	learn: 51.1872476	total: 6.78s	remaining: 34.5s
493:	learn: 51.1485149	total: 6.8s	remaining: 34.5s
494:	learn: 51.1382372	total: 6.82s	remaining: 34.5s
495:	learn: 51.0962142	total: 6.83s	remaining: 34.5s
496:	learn: 51.0381474	total: 6.85s	remaining: 34.5s
497:	learn: 51.0169791	total: 6.86s	remaining: 34.5s
498:	learn: 50.9658632	total: 6.88s	remaining: 34.5s
499:	learn: 50.9571540	total: 6.89s	remaining: 34.5s
500:	learn: 50.9168031	total: 6.9s	remaining: 34.4s
501:	learn: 50.9082168	total: 6.92s	remaining: 34.4s
502:	learn: 50.8746950	total: 6.93s	remaining: 34.4s
503:	learn: 50.8326182	total: 6.95s	remaining: 34.4s
504:	learn: 50.8013946	total: 6.96s	remaining: 34.4s
505:	learn: 50.7599702	total: 6.97s	remaining: 34.4s
506:	learn: 50.7296975	total: 6.99s	remaining: 3

650:	learn: 46.9621010	total: 8.98s	remaining: 32.4s
651:	learn: 46.9320612	total: 9s	remaining: 32.4s
652:	learn: 46.8900009	total: 9.01s	remaining: 32.4s
653:	learn: 46.8489214	total: 9.02s	remaining: 32.4s
654:	learn: 46.8360337	total: 9.04s	remaining: 32.3s
655:	learn: 46.8157048	total: 9.05s	remaining: 32.3s
656:	learn: 46.8030926	total: 9.06s	remaining: 32.3s
657:	learn: 46.7748843	total: 9.08s	remaining: 32.3s
658:	learn: 46.7472053	total: 9.09s	remaining: 32.3s
659:	learn: 46.7366626	total: 9.11s	remaining: 32.3s
660:	learn: 46.6660873	total: 9.12s	remaining: 32.3s
661:	learn: 46.6387904	total: 9.14s	remaining: 32.3s
662:	learn: 46.6123405	total: 9.15s	remaining: 32.3s
663:	learn: 46.6039025	total: 9.17s	remaining: 32.3s
664:	learn: 46.5534313	total: 9.18s	remaining: 32.2s
665:	learn: 46.5281806	total: 9.2s	remaining: 32.2s
666:	learn: 46.5199262	total: 9.21s	remaining: 32.2s
667:	learn: 46.5131186	total: 9.22s	remaining: 32.2s
668:	learn: 46.4930107	total: 9.23s	remaining: 32.

812:	learn: 43.5151608	total: 11.3s	remaining: 30.3s
813:	learn: 43.4853054	total: 11.3s	remaining: 30.3s
814:	learn: 43.4152224	total: 11.3s	remaining: 30.3s
815:	learn: 43.4057165	total: 11.3s	remaining: 30.3s
816:	learn: 43.3996509	total: 11.3s	remaining: 30.3s
817:	learn: 43.3752342	total: 11.3s	remaining: 30.3s
818:	learn: 43.3693387	total: 11.4s	remaining: 30.2s
819:	learn: 43.3477299	total: 11.4s	remaining: 30.2s
820:	learn: 43.2808860	total: 11.4s	remaining: 30.2s
821:	learn: 43.2222098	total: 11.4s	remaining: 30.2s
822:	learn: 43.2066670	total: 11.4s	remaining: 30.2s
823:	learn: 43.1944323	total: 11.4s	remaining: 30.2s
824:	learn: 43.1873423	total: 11.4s	remaining: 30.2s
825:	learn: 43.1531961	total: 11.5s	remaining: 30.1s
826:	learn: 43.0894746	total: 11.5s	remaining: 30.1s
827:	learn: 43.0820334	total: 11.5s	remaining: 30.1s
828:	learn: 43.0162247	total: 11.5s	remaining: 30.1s
829:	learn: 42.9620646	total: 11.5s	remaining: 30.1s
830:	learn: 42.9251012	total: 11.5s	remaining:

976:	learn: 40.4993408	total: 13.5s	remaining: 28s
977:	learn: 40.4937316	total: 13.5s	remaining: 28s
978:	learn: 40.4872081	total: 13.5s	remaining: 28s
979:	learn: 40.4658987	total: 13.6s	remaining: 27.9s
980:	learn: 40.4416756	total: 13.6s	remaining: 27.9s
981:	learn: 40.4315855	total: 13.6s	remaining: 27.9s
982:	learn: 40.4129486	total: 13.6s	remaining: 27.9s
983:	learn: 40.3742983	total: 13.6s	remaining: 27.9s
984:	learn: 40.3685273	total: 13.6s	remaining: 27.9s
985:	learn: 40.3615283	total: 13.6s	remaining: 27.8s
986:	learn: 40.3244118	total: 13.6s	remaining: 27.8s
987:	learn: 40.2887475	total: 13.7s	remaining: 27.8s
988:	learn: 40.2681393	total: 13.7s	remaining: 27.8s
989:	learn: 40.2591500	total: 13.7s	remaining: 27.8s
990:	learn: 40.2280202	total: 13.7s	remaining: 27.8s
991:	learn: 40.1928784	total: 13.7s	remaining: 27.8s
992:	learn: 40.1794454	total: 13.7s	remaining: 27.7s
993:	learn: 40.1726707	total: 13.7s	remaining: 27.7s
994:	learn: 40.1678556	total: 13.8s	remaining: 27.7s

1142:	learn: 38.1757127	total: 15.8s	remaining: 25.6s
1143:	learn: 38.1676734	total: 15.8s	remaining: 25.6s
1144:	learn: 38.1478541	total: 15.8s	remaining: 25.6s
1145:	learn: 38.1424594	total: 15.8s	remaining: 25.6s
1146:	learn: 38.1279923	total: 15.8s	remaining: 25.6s
1147:	learn: 38.1208676	total: 15.8s	remaining: 25.5s
1148:	learn: 38.1148597	total: 15.8s	remaining: 25.5s
1149:	learn: 38.1103645	total: 15.9s	remaining: 25.5s
1150:	learn: 38.0694132	total: 15.9s	remaining: 25.5s
1151:	learn: 38.0575283	total: 15.9s	remaining: 25.5s
1152:	learn: 38.0467666	total: 15.9s	remaining: 25.5s
1153:	learn: 38.0419240	total: 15.9s	remaining: 25.5s
1154:	learn: 38.0221847	total: 15.9s	remaining: 25.4s
1155:	learn: 38.0054166	total: 15.9s	remaining: 25.4s
1156:	learn: 38.0006749	total: 16s	remaining: 25.4s
1157:	learn: 37.9892001	total: 16s	remaining: 25.4s
1158:	learn: 37.9840226	total: 16s	remaining: 25.4s
1159:	learn: 37.9768177	total: 16s	remaining: 25.4s
1160:	learn: 37.9716028	total: 16s	r

1296:	learn: 36.3683478	total: 18s	remaining: 23.7s
1297:	learn: 36.3458070	total: 18s	remaining: 23.7s
1298:	learn: 36.3251649	total: 18.1s	remaining: 23.7s
1299:	learn: 36.3188117	total: 18.1s	remaining: 23.6s
1300:	learn: 36.3043775	total: 18.1s	remaining: 23.6s
1301:	learn: 36.2873059	total: 18.1s	remaining: 23.6s
1302:	learn: 36.2790113	total: 18.1s	remaining: 23.6s
1303:	learn: 36.2653584	total: 18.1s	remaining: 23.6s
1304:	learn: 36.2574732	total: 18.1s	remaining: 23.6s
1305:	learn: 36.2484379	total: 18.2s	remaining: 23.5s
1306:	learn: 36.2418482	total: 18.2s	remaining: 23.5s
1307:	learn: 36.2370331	total: 18.2s	remaining: 23.5s
1308:	learn: 36.2323073	total: 18.2s	remaining: 23.5s
1309:	learn: 36.2152818	total: 18.2s	remaining: 23.5s
1310:	learn: 36.1960021	total: 18.2s	remaining: 23.5s
1311:	learn: 36.1910858	total: 18.2s	remaining: 23.5s
1312:	learn: 36.1894093	total: 18.2s	remaining: 23.4s
1313:	learn: 36.1784703	total: 18.3s	remaining: 23.4s
1314:	learn: 36.1693942	total: 1

1453:	learn: 34.7962861	total: 20.3s	remaining: 21.6s
1454:	learn: 34.7911436	total: 20.3s	remaining: 21.5s
1455:	learn: 34.7838026	total: 20.3s	remaining: 21.5s
1456:	learn: 34.7782401	total: 20.3s	remaining: 21.5s
1457:	learn: 34.7745795	total: 20.3s	remaining: 21.5s
1458:	learn: 34.7683551	total: 20.3s	remaining: 21.5s
1459:	learn: 34.7544829	total: 20.4s	remaining: 21.5s
1460:	learn: 34.7342605	total: 20.4s	remaining: 21.5s
1461:	learn: 34.7070548	total: 20.4s	remaining: 21.4s
1462:	learn: 34.7007284	total: 20.4s	remaining: 21.4s
1463:	learn: 34.6947368	total: 20.4s	remaining: 21.4s
1464:	learn: 34.6902048	total: 20.4s	remaining: 21.4s
1465:	learn: 34.6793732	total: 20.4s	remaining: 21.4s
1466:	learn: 34.6525231	total: 20.4s	remaining: 21.4s
1467:	learn: 34.6411834	total: 20.5s	remaining: 21.4s
1468:	learn: 34.6378321	total: 20.5s	remaining: 21.3s
1469:	learn: 34.6315779	total: 20.5s	remaining: 21.3s
1470:	learn: 34.6264289	total: 20.5s	remaining: 21.3s
1471:	learn: 34.6034344	tota

1612:	learn: 33.1531492	total: 22.5s	remaining: 19.4s
1613:	learn: 33.1486329	total: 22.5s	remaining: 19.4s
1614:	learn: 33.1344440	total: 22.6s	remaining: 19.3s
1615:	learn: 33.1241361	total: 22.6s	remaining: 19.3s
1616:	learn: 33.1189034	total: 22.6s	remaining: 19.3s
1617:	learn: 33.1100711	total: 22.6s	remaining: 19.3s
1618:	learn: 33.0993218	total: 22.6s	remaining: 19.3s
1619:	learn: 33.0966966	total: 22.6s	remaining: 19.3s
1620:	learn: 33.0905373	total: 22.7s	remaining: 19.3s
1621:	learn: 33.0749297	total: 22.7s	remaining: 19.3s
1622:	learn: 33.0667537	total: 22.7s	remaining: 19.2s
1623:	learn: 33.0593589	total: 22.7s	remaining: 19.2s
1624:	learn: 33.0444308	total: 22.7s	remaining: 19.2s
1625:	learn: 33.0285543	total: 22.7s	remaining: 19.2s
1626:	learn: 33.0178390	total: 22.7s	remaining: 19.2s
1627:	learn: 32.9992964	total: 22.8s	remaining: 19.2s
1628:	learn: 32.9961586	total: 22.8s	remaining: 19.2s
1629:	learn: 32.9885035	total: 22.8s	remaining: 19.2s
1630:	learn: 32.9841075	tota

1769:	learn: 31.6113533	total: 24.8s	remaining: 17.2s
1770:	learn: 31.6087525	total: 24.8s	remaining: 17.2s
1771:	learn: 31.6055438	total: 24.8s	remaining: 17.2s
1772:	learn: 31.5943814	total: 24.8s	remaining: 17.2s
1773:	learn: 31.5856782	total: 24.8s	remaining: 17.2s
1774:	learn: 31.5813587	total: 24.9s	remaining: 17.2s
1775:	learn: 31.5775478	total: 24.9s	remaining: 17.1s
1776:	learn: 31.5628422	total: 24.9s	remaining: 17.1s
1777:	learn: 31.5442389	total: 24.9s	remaining: 17.1s
1778:	learn: 31.5398505	total: 24.9s	remaining: 17.1s
1779:	learn: 31.5237214	total: 24.9s	remaining: 17.1s
1780:	learn: 31.5145968	total: 24.9s	remaining: 17.1s
1781:	learn: 31.4983167	total: 24.9s	remaining: 17.1s
1782:	learn: 31.4923134	total: 25s	remaining: 17s
1783:	learn: 31.4833180	total: 25s	remaining: 17s
1784:	learn: 31.4720134	total: 25s	remaining: 17s
1785:	learn: 31.4694545	total: 25s	remaining: 17s
1786:	learn: 31.4630587	total: 25s	remaining: 17s
1787:	learn: 31.4572611	total: 25s	remaining: 17

1927:	learn: 30.4223686	total: 27s	remaining: 15s
1928:	learn: 30.4150290	total: 27s	remaining: 15s
1929:	learn: 30.4087004	total: 27s	remaining: 15s
1930:	learn: 30.4029451	total: 27s	remaining: 15s
1931:	learn: 30.3991249	total: 27.1s	remaining: 15s
1932:	learn: 30.3964544	total: 27.1s	remaining: 14.9s
1933:	learn: 30.3946422	total: 27.1s	remaining: 14.9s
1934:	learn: 30.3904675	total: 27.1s	remaining: 14.9s
1935:	learn: 30.3841289	total: 27.1s	remaining: 14.9s
1936:	learn: 30.3811190	total: 27.1s	remaining: 14.9s
1937:	learn: 30.3714915	total: 27.1s	remaining: 14.9s
1938:	learn: 30.3643484	total: 27.1s	remaining: 14.9s
1939:	learn: 30.3625596	total: 27.2s	remaining: 14.8s
1940:	learn: 30.3569879	total: 27.2s	remaining: 14.8s
1941:	learn: 30.3483762	total: 27.2s	remaining: 14.8s
1942:	learn: 30.3454498	total: 27.2s	remaining: 14.8s
1943:	learn: 30.3337082	total: 27.2s	remaining: 14.8s
1944:	learn: 30.3319447	total: 27.2s	remaining: 14.8s
1945:	learn: 30.3274758	total: 27.2s	remaining

2095:	learn: 29.2860375	total: 29.2s	remaining: 12.6s
2096:	learn: 29.2813995	total: 29.3s	remaining: 12.6s
2097:	learn: 29.2768531	total: 29.3s	remaining: 12.6s
2098:	learn: 29.2719450	total: 29.3s	remaining: 12.6s
2099:	learn: 29.2626145	total: 29.3s	remaining: 12.6s
2100:	learn: 29.2598331	total: 29.3s	remaining: 12.5s
2101:	learn: 29.2568423	total: 29.3s	remaining: 12.5s
2102:	learn: 29.2457891	total: 29.3s	remaining: 12.5s
2103:	learn: 29.2357232	total: 29.4s	remaining: 12.5s
2104:	learn: 29.2333966	total: 29.4s	remaining: 12.5s
2105:	learn: 29.2311233	total: 29.4s	remaining: 12.5s
2106:	learn: 29.2211221	total: 29.4s	remaining: 12.5s
2107:	learn: 29.2159135	total: 29.4s	remaining: 12.4s
2108:	learn: 29.1945664	total: 29.4s	remaining: 12.4s
2109:	learn: 29.1922137	total: 29.4s	remaining: 12.4s
2110:	learn: 29.1823719	total: 29.4s	remaining: 12.4s
2111:	learn: 29.1800707	total: 29.5s	remaining: 12.4s
2112:	learn: 29.1702508	total: 29.5s	remaining: 12.4s
2113:	learn: 29.1677109	tota

2259:	learn: 28.2473001	total: 31.5s	remaining: 10.3s
2260:	learn: 28.2417447	total: 31.5s	remaining: 10.3s
2261:	learn: 28.2393280	total: 31.5s	remaining: 10.3s
2262:	learn: 28.2359609	total: 31.5s	remaining: 10.3s
2263:	learn: 28.2304052	total: 31.5s	remaining: 10.2s
2264:	learn: 28.2274289	total: 31.5s	remaining: 10.2s
2265:	learn: 28.2262227	total: 31.6s	remaining: 10.2s
2266:	learn: 28.2225510	total: 31.6s	remaining: 10.2s
2267:	learn: 28.2128529	total: 31.6s	remaining: 10.2s
2268:	learn: 28.2060786	total: 31.6s	remaining: 10.2s
2269:	learn: 28.2024028	total: 31.6s	remaining: 10.2s
2270:	learn: 28.1991353	total: 31.6s	remaining: 10.2s
2271:	learn: 28.1955050	total: 31.6s	remaining: 10.1s
2272:	learn: 28.1889921	total: 31.7s	remaining: 10.1s
2273:	learn: 28.1828558	total: 31.7s	remaining: 10.1s
2274:	learn: 28.1763153	total: 31.7s	remaining: 10.1s
2275:	learn: 28.1743392	total: 31.7s	remaining: 10.1s
2276:	learn: 28.1658387	total: 31.7s	remaining: 10.1s
2277:	learn: 28.1534033	tota

2420:	learn: 27.2418436	total: 33.7s	remaining: 8.07s
2421:	learn: 27.2391527	total: 33.7s	remaining: 8.05s
2422:	learn: 27.2356396	total: 33.8s	remaining: 8.04s
2423:	learn: 27.2338516	total: 33.8s	remaining: 8.02s
2424:	learn: 27.2267805	total: 33.8s	remaining: 8.01s
2425:	learn: 27.2224308	total: 33.8s	remaining: 8s
2426:	learn: 27.2147882	total: 33.8s	remaining: 7.99s
2427:	learn: 27.2118751	total: 33.8s	remaining: 7.97s
2428:	learn: 27.2104121	total: 33.9s	remaining: 7.96s
2429:	learn: 27.2079255	total: 33.9s	remaining: 7.95s
2430:	learn: 27.2050882	total: 33.9s	remaining: 7.93s
2431:	learn: 27.1976855	total: 33.9s	remaining: 7.92s
2432:	learn: 27.1938903	total: 33.9s	remaining: 7.9s
2433:	learn: 27.1932182	total: 33.9s	remaining: 7.89s
2434:	learn: 27.1895524	total: 33.9s	remaining: 7.88s
2435:	learn: 27.1862815	total: 34s	remaining: 7.86s
2436:	learn: 27.1826265	total: 34s	remaining: 7.85s
2437:	learn: 27.1799270	total: 34s	remaining: 7.83s
2438:	learn: 27.1741871	total: 34s	rem

2575:	learn: 26.4115075	total: 36s	remaining: 5.92s
2576:	learn: 26.4096016	total: 36s	remaining: 5.91s
2577:	learn: 26.4074279	total: 36s	remaining: 5.89s
2578:	learn: 26.4063692	total: 36s	remaining: 5.88s
2579:	learn: 26.4046890	total: 36s	remaining: 5.87s
2580:	learn: 26.4037860	total: 36s	remaining: 5.85s
2581:	learn: 26.3985098	total: 36.1s	remaining: 5.84s
2582:	learn: 26.3927195	total: 36.1s	remaining: 5.82s
2583:	learn: 26.3841606	total: 36.1s	remaining: 5.81s
2584:	learn: 26.3764871	total: 36.1s	remaining: 5.79s
2585:	learn: 26.3687701	total: 36.1s	remaining: 5.78s
2586:	learn: 26.3581215	total: 36.1s	remaining: 5.77s
2587:	learn: 26.3555842	total: 36.1s	remaining: 5.75s
2588:	learn: 26.3531071	total: 36.1s	remaining: 5.74s
2589:	learn: 26.3501937	total: 36.2s	remaining: 5.72s
2590:	learn: 26.3476926	total: 36.2s	remaining: 5.71s
2591:	learn: 26.3410716	total: 36.2s	remaining: 5.7s
2592:	learn: 26.3326226	total: 36.2s	remaining: 5.68s
2593:	learn: 26.3267864	total: 36.2s	rema

2734:	learn: 25.6847179	total: 38.2s	remaining: 3.7s
2735:	learn: 25.6696670	total: 38.2s	remaining: 3.69s
2736:	learn: 25.6685462	total: 38.2s	remaining: 3.67s
2737:	learn: 25.6657420	total: 38.3s	remaining: 3.66s
2738:	learn: 25.6643262	total: 38.3s	remaining: 3.65s
2739:	learn: 25.6622466	total: 38.3s	remaining: 3.63s
2740:	learn: 25.6597228	total: 38.3s	remaining: 3.62s
2741:	learn: 25.6541197	total: 38.3s	remaining: 3.6s
2742:	learn: 25.6499775	total: 38.3s	remaining: 3.59s
2743:	learn: 25.6476430	total: 38.3s	remaining: 3.58s
2744:	learn: 25.6447213	total: 38.3s	remaining: 3.56s
2745:	learn: 25.6358282	total: 38.4s	remaining: 3.55s
2746:	learn: 25.6335629	total: 38.4s	remaining: 3.53s
2747:	learn: 25.6246098	total: 38.4s	remaining: 3.52s
2748:	learn: 25.6162982	total: 38.4s	remaining: 3.51s
2749:	learn: 25.6073016	total: 38.4s	remaining: 3.49s
2750:	learn: 25.5964592	total: 38.4s	remaining: 3.48s
2751:	learn: 25.5911635	total: 38.4s	remaining: 3.46s
2752:	learn: 25.5888188	total:

2896:	learn: 24.8081073	total: 40.5s	remaining: 1.44s
2897:	learn: 24.7997896	total: 40.5s	remaining: 1.42s
2898:	learn: 24.7945004	total: 40.5s	remaining: 1.41s
2899:	learn: 24.7893949	total: 40.5s	remaining: 1.4s
2900:	learn: 24.7858514	total: 40.5s	remaining: 1.38s
2901:	learn: 24.7834931	total: 40.5s	remaining: 1.37s
2902:	learn: 24.7786436	total: 40.6s	remaining: 1.35s
2903:	learn: 24.7729940	total: 40.6s	remaining: 1.34s
2904:	learn: 24.7710007	total: 40.6s	remaining: 1.33s
2905:	learn: 24.7656133	total: 40.6s	remaining: 1.31s
2906:	learn: 24.7621700	total: 40.6s	remaining: 1.3s
2907:	learn: 24.7567763	total: 40.6s	remaining: 1.28s
2908:	learn: 24.7455150	total: 40.6s	remaining: 1.27s
2909:	learn: 24.7404231	total: 40.7s	remaining: 1.26s
2910:	learn: 24.7350736	total: 40.7s	remaining: 1.24s
2911:	learn: 24.7281153	total: 40.7s	remaining: 1.23s
2912:	learn: 24.7183597	total: 40.7s	remaining: 1.22s
2913:	learn: 24.7153638	total: 40.7s	remaining: 1.2s
2914:	learn: 24.7140426	total: 

## Finalized

In [7]:
###################################
# Once we finalized our features and model we can train it using the whole training set and then produce prediction for the evaluating dataset
###################################
# Load the evaluation data
eval_data = pd.read_csv("evaluation.csv")
train_data = pd.read_csv("train.csv")
y_train = train_data['retweets_count']

# Transform our data into tfidf vectors
# NOTE(review): X_train / X_val are computed here but never passed to the model below,
# which is trained on the non-text columns only. Assigning X_train also replaces the
# DataFrame of the same name created by the train/test split cell.
vectorizer = TfidfVectorizer(max_features=100, stop_words=stopwords.words('french'))
X_train = vectorizer.fit_transform(train_data['text'])
X_val = vectorizer.transform(eval_data['text'])

# train_columns may list engineered columns (e.g. "urls_count") that do not exist in the
# freshly re-read CSVs; selecting them would raise KeyError. Keep only the columns that
# are present in both frames so this baseline runs regardless of how train_columns was set.
baseline_columns = [
    col for col in train_columns
    if col in train_data.columns and col in eval_data.columns
]
train_data = train_data.loc[:, baseline_columns]

# We fit our model using the training data (seeded so the submission is reproducible)
reg = GradientBoostingRegressor(random_state=42)
reg.fit(train_data, y_train)

# Predict the number of retweets for the evaluation dataset
eval_data_tr = eval_data.loc[:, baseline_columns]
y_pred = reg.predict(eval_data_tr)
# Negative predictions become 0, the others are truncated to integers.
y_pred = [int(value) if value >= 0 else 0 for value in y_pred]

# Dump the results into a file that follows the required Kaggle template.
# newline='' lets the csv module control line endings (no blank rows on Windows).
with open("gbr_predictions.txt", 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["TweetID", "retweets_count"])
    for tweet_id, prediction in zip(eval_data['TweetID'], y_pred):
        writer.writerow([str(tweet_id), str(int(prediction))])


### Catboost

In [9]:
###################################
# Once we finalized our features and model we can train it using the whole training set and then produce prediction for the evaluating dataset
###################################
# Load the evaluation data
eval_data = pd.read_csv("evaluation.csv")
train_data = pd.read_csv("train.csv")
y_train = train_data['retweets_count']

# Transform our data into tfidf vectors
# NOTE(review): X_train / X_val are computed here but never passed to the model below,
# which is trained on the non-text columns only. Assigning X_train also replaces the
# DataFrame of the same name created by the train/test split cell.
vectorizer = TfidfVectorizer(max_features=100, stop_words=stopwords.words('french'))
X_train = vectorizer.fit_transform(train_data['text'])
X_val = vectorizer.transform(eval_data['text'])

# train_columns may list engineered columns (e.g. "urls_count") that do not exist in the
# freshly re-read CSVs; selecting them would raise KeyError. Keep only the columns that
# are present in both frames so this baseline runs regardless of how train_columns was set.
baseline_columns = [
    col for col in train_columns
    if col in train_data.columns and col in eval_data.columns
]
train_data = train_data.loc[:, baseline_columns]

# We fit our model using the training data; no feature is declared categorical.
model = CatBoostRegressor(
    random_seed=42,
)
categorical_features_indices = []
model.fit(
    train_data, y_train,
    cat_features=categorical_features_indices,
    plot=True
)

# Predict the number of retweets for the evaluation dataset
eval_data_tr = eval_data.loc[:, baseline_columns]
y_pred = model.predict(eval_data_tr)
# Negative predictions become 0, the others are truncated to integers.
y_pred = [int(value) if value >= 0 else 0 for value in y_pred]

# Dump the results into a file that follows the required Kaggle template.
# newline='' lets the csv module control line endings (no blank rows on Windows).
with open("catboost_predictions.txt", 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["TweetID", "retweets_count"])
    for tweet_id, prediction in zip(eval_data['TweetID'], y_pred):
        writer.writerow([str(tweet_id), str(int(prediction))])



MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.103494
0:	learn: 227.3084694	total: 19.8ms	remaining: 19.8s
1:	learn: 214.1015269	total: 31.7ms	remaining: 15.8s
2:	learn: 202.3536656	total: 41.8ms	remaining: 13.9s
3:	learn: 192.0202205	total: 52.9ms	remaining: 13.2s
4:	learn: 183.1236626	total: 64.4ms	remaining: 12.8s
5:	learn: 174.4972183	total: 76.1ms	remaining: 12.6s
6:	learn: 167.2204251	total: 88.3ms	remaining: 12.5s
7:	learn: 160.7018324	total: 101ms	remaining: 12.5s
8:	learn: 154.4313337	total: 112ms	remaining: 12.3s
9:	learn: 148.3815698	total: 122ms	remaining: 12.1s
10:	learn: 143.4898758	total: 133ms	remaining: 12s
11:	learn: 139.4225643	total: 144ms	remaining: 11.9s
12:	learn: 134.9648654	total: 154ms	remaining: 11.7s
13:	learn: 131.1704842	total: 165ms	remaining: 11.6s
14:	learn: 127.5485716	total: 176ms	remaining: 11.6s
15:	learn: 123.9356973	total: 189ms	remaining: 11.6s
16:	learn: 120.7580422	total: 201ms	remaining: 11.6s
17:	learn: 118.1206485	total: 212ms	remaining: 11.6s
18:	learn: 115.698726

161:	learn: 57.6980306	total: 1.8s	remaining: 9.31s
162:	learn: 57.6011865	total: 1.81s	remaining: 9.3s
163:	learn: 57.4410389	total: 1.82s	remaining: 9.29s
164:	learn: 57.2944158	total: 1.83s	remaining: 9.28s
165:	learn: 57.1977987	total: 1.84s	remaining: 9.27s
166:	learn: 57.1108058	total: 1.86s	remaining: 9.27s
167:	learn: 56.9330071	total: 1.87s	remaining: 9.26s
168:	learn: 56.8040478	total: 1.88s	remaining: 9.25s
169:	learn: 56.7393698	total: 1.89s	remaining: 9.24s
170:	learn: 56.5818755	total: 1.9s	remaining: 9.23s
171:	learn: 56.5040743	total: 1.92s	remaining: 9.22s
172:	learn: 56.3598442	total: 1.92s	remaining: 9.2s
173:	learn: 56.2094935	total: 1.94s	remaining: 9.19s
174:	learn: 56.0855090	total: 1.95s	remaining: 9.18s
175:	learn: 55.9812128	total: 1.96s	remaining: 9.16s
176:	learn: 55.9277675	total: 1.97s	remaining: 9.15s
177:	learn: 55.8715533	total: 1.98s	remaining: 9.14s
178:	learn: 55.7484590	total: 1.99s	remaining: 9.12s
179:	learn: 55.6219516	total: 2s	remaining: 9.11s


331:	learn: 45.0677054	total: 3.62s	remaining: 7.28s
332:	learn: 45.0463603	total: 3.63s	remaining: 7.28s
333:	learn: 45.0332898	total: 3.64s	remaining: 7.26s
334:	learn: 44.9657457	total: 3.65s	remaining: 7.25s
335:	learn: 44.9439451	total: 3.66s	remaining: 7.24s
336:	learn: 44.9213435	total: 3.67s	remaining: 7.23s
337:	learn: 44.8684566	total: 3.68s	remaining: 7.22s
338:	learn: 44.7711529	total: 3.69s	remaining: 7.21s
339:	learn: 44.7245457	total: 3.71s	remaining: 7.2s
340:	learn: 44.7153633	total: 3.72s	remaining: 7.18s
341:	learn: 44.6731840	total: 3.73s	remaining: 7.17s
342:	learn: 44.6212241	total: 3.74s	remaining: 7.16s
343:	learn: 44.5779885	total: 3.75s	remaining: 7.14s
344:	learn: 44.5126231	total: 3.76s	remaining: 7.13s
345:	learn: 44.4657576	total: 3.77s	remaining: 7.12s
346:	learn: 44.4242085	total: 3.78s	remaining: 7.11s
347:	learn: 44.3903432	total: 3.79s	remaining: 7.1s
348:	learn: 44.3519818	total: 3.8s	remaining: 7.09s
349:	learn: 44.3394519	total: 3.81s	remaining: 7.

502:	learn: 39.0364308	total: 5.43s	remaining: 5.37s
503:	learn: 39.0207314	total: 5.45s	remaining: 5.36s
504:	learn: 39.0145767	total: 5.46s	remaining: 5.35s
505:	learn: 38.9798187	total: 5.47s	remaining: 5.34s
506:	learn: 38.9305850	total: 5.48s	remaining: 5.33s
507:	learn: 38.8840633	total: 5.49s	remaining: 5.32s
508:	learn: 38.8643328	total: 5.5s	remaining: 5.31s
509:	learn: 38.8468196	total: 5.51s	remaining: 5.29s
510:	learn: 38.8238070	total: 5.52s	remaining: 5.28s
511:	learn: 38.7977392	total: 5.53s	remaining: 5.27s
512:	learn: 38.7660409	total: 5.54s	remaining: 5.26s
513:	learn: 38.7472116	total: 5.55s	remaining: 5.25s
514:	learn: 38.7242469	total: 5.56s	remaining: 5.24s
515:	learn: 38.7026812	total: 5.57s	remaining: 5.23s
516:	learn: 38.6839779	total: 5.58s	remaining: 5.22s
517:	learn: 38.6421495	total: 5.59s	remaining: 5.21s
518:	learn: 38.6209332	total: 5.6s	remaining: 5.19s
519:	learn: 38.6020252	total: 5.61s	remaining: 5.18s
520:	learn: 38.5790047	total: 5.62s	remaining: 5

675:	learn: 35.1609187	total: 7.25s	remaining: 3.48s
676:	learn: 35.1453969	total: 7.26s	remaining: 3.46s
677:	learn: 35.1379038	total: 7.27s	remaining: 3.45s
678:	learn: 35.1295658	total: 7.28s	remaining: 3.44s
679:	learn: 35.1175897	total: 7.29s	remaining: 3.43s
680:	learn: 35.1107010	total: 7.3s	remaining: 3.42s
681:	learn: 35.1009570	total: 7.31s	remaining: 3.41s
682:	learn: 35.0839237	total: 7.32s	remaining: 3.4s
683:	learn: 35.0607331	total: 7.33s	remaining: 3.39s
684:	learn: 35.0437084	total: 7.35s	remaining: 3.38s
685:	learn: 35.0189172	total: 7.36s	remaining: 3.37s
686:	learn: 34.9716612	total: 7.37s	remaining: 3.36s
687:	learn: 34.9505548	total: 7.38s	remaining: 3.35s
688:	learn: 34.9285275	total: 7.38s	remaining: 3.33s
689:	learn: 34.9140667	total: 7.39s	remaining: 3.32s
690:	learn: 34.9083112	total: 7.4s	remaining: 3.31s
691:	learn: 34.8932655	total: 7.42s	remaining: 3.3s
692:	learn: 34.8828484	total: 7.42s	remaining: 3.29s
693:	learn: 34.8541197	total: 7.43s	remaining: 3.2

848:	learn: 32.1850593	total: 9.07s	remaining: 1.61s
849:	learn: 32.1775824	total: 9.08s	remaining: 1.6s
850:	learn: 32.1665964	total: 9.09s	remaining: 1.59s
851:	learn: 32.1572649	total: 9.1s	remaining: 1.58s
852:	learn: 32.1463956	total: 9.11s	remaining: 1.57s
853:	learn: 32.1427383	total: 9.12s	remaining: 1.56s
854:	learn: 32.1310498	total: 9.13s	remaining: 1.55s
855:	learn: 32.1175689	total: 9.14s	remaining: 1.54s
856:	learn: 32.1022674	total: 9.16s	remaining: 1.53s
857:	learn: 32.0852500	total: 9.16s	remaining: 1.52s
858:	learn: 32.0757852	total: 9.18s	remaining: 1.51s
859:	learn: 32.0574366	total: 9.19s	remaining: 1.5s
860:	learn: 32.0384891	total: 9.2s	remaining: 1.48s
861:	learn: 32.0293659	total: 9.21s	remaining: 1.47s
862:	learn: 32.0263652	total: 9.22s	remaining: 1.46s
863:	learn: 32.0076752	total: 9.23s	remaining: 1.45s
864:	learn: 31.9967468	total: 9.24s	remaining: 1.44s
865:	learn: 31.9802655	total: 9.25s	remaining: 1.43s
866:	learn: 31.9541071	total: 9.26s	remaining: 1.4

In [None]:
###################################
# Once we finalized our features and model we can train it using the whole training set and then produce prediction for the evaluating dataset
###################################
# Load the evaluation data
eval_data = pd.read_csv("evaluation.csv")
# Transform our data into tfidf vectors
vectorizer = TfidfVectorizer(max_features=100, stop_words=stopwords.words('french'))
train_data = pd.read_csv("train.csv")

def hash_to_list(x):
    """Parse the string form of a list ("['a', 'b']") into a list of strings; "[]" gives [""]."""
    return [ (i.strip())[1:-1] for i in x[1:-1].split(",")]

def extract_part_of_link(x):
    """Return the third '/'-separated segment of each link ("" for empty or too-short entries)."""
    hosts = []
    for link in x:
        segments = link.split('/')
        hosts.append(segments[2] if len(segments) > 2 else "")
    return hosts

def column_count(x):
    """Sum the training-set frequencies of the non-empty tokens of one row.

    Frequencies are read from the module-level table `d` built in the loop below.
    Tokens that never occurred in the training data contribute 0, which is what
    makes the function usable on the evaluation rows.
    """
    return sum(d.get(i, 0) for i in x if i != "")

columns = ["urls", "mentions", "hashtags"]
for c in columns:
    # The list columns are stored as strings in the CSVs; parse both frames.
    # Assumes evaluation.csv carries the same urls/mentions/hashtags columns
    # as train.csv -- TODO confirm.
    train_data[c] = train_data[c].apply(hash_to_list)
    eval_data[c] = eval_data[c].apply(hash_to_list)
    # Token frequencies over the training data only; rows whose first entry is ""
    # (an empty list) are skipped entirely.
    d = Counter(
        token
        for tokens in train_data[c]
        if tokens[0] != ""
        for token in tokens
    )
    # Both frames must receive the *_count feature, otherwise selecting
    # train_columns from eval_data below raises KeyError.
    train_data[f"{c}_count"] = train_data[c].apply(column_count)
    eval_data[f"{c}_count"] = eval_data[c].apply(column_count)

y_train = train_data['retweets_count']
X_train_text = vectorizer.fit_transform(train_data['text'])
train_data = train_data.loc[:,train_columns]

# Design matrix: selected columns followed by the dense TF-IDF columns.
X_train_all = np.hstack((train_data.to_numpy(), X_train_text.toarray()))

# We fit our model using the training data (seeded so the submission is reproducible)
reg = GradientBoostingRegressor(n_estimators=500, random_state=42)
reg.fit(X_train_all, y_train)

# Build the evaluation matrix with the same column layout as the training matrix.
X_val_text = vectorizer.transform(eval_data['text'])
eval_data_tr = eval_data.loc[:,train_columns]
X_val_all = np.hstack((eval_data_tr.to_numpy(), X_val_text.toarray()))

# Predict the number of retweets for the evaluation dataset
y_pred = reg.predict(X_val_all)
# Negative predictions become 0, the others are truncated to integers.
y_pred = [int(value) if value >= 0 else 0 for value in y_pred]

# Dump the results into a file that follows the required Kaggle template.
# newline='' lets the csv module control line endings (no blank rows on Windows).
with open("gbr_predictions_hash_feat_500.txt", 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["TweetID", "retweets_count"])
    for tweet_id, prediction in zip(eval_data['TweetID'], y_pred):
        writer.writerow([str(tweet_id), str(int(prediction))])