In [1]:
import pandas as pd
import warnings 
warnings.filterwarnings('ignore')
import xgboost as xg
from sklearn.metrics import mean_squared_error as MSE 
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
# Read the tweet export from the Excel workbook in the working directory.
df = pd.read_excel(io='tweets.xlsx')

In [None]:
# Peek at the first two rows to get a feel for the columns and their values.
df.head(n=2)

In [None]:
# Scatter each candidate predictor against the retweet count to eyeball
# whether the relationship looks linear. The author's reading of these plots
# is that it does not, which is why linear regression is not used below.
ax1, ax2, ax3, ax4, ax5 = (
    df.plot.scatter(x=predictor, y='TweetRetweetCount')
    for predictor in (
        'UserTweetCount',
        'TweetFavoritesCount',
        'UserFollowersCount',
        'UserFriendsCount',
        'UserListedCount',
    )
)

In [None]:
# Drop identifier / bookkeeping columns that are not used as model features.
# The frame is rebound instead of mutated in place, and errors='ignore' lets
# the cell be re-run after the columns are already gone (the in-place version
# raised KeyError on a second run). Trade-off: a misspelled column name here
# is silently skipped rather than reported.
df = df.drop(
    columns=[
        'TweetInReplyToStatusID',
        'TweetInReplyToUserID',
        'UserID',
        'MacroIterationNumber',
        'TweetPlaceAttributes',
        'TweetPlaceContainedWithin',
    ],
    errors='ignore',
)

# Correlate only numeric/bool columns. Since pandas 2.0 `DataFrame.corr()` no
# longer drops non-numeric columns by default and raises on string columns
# such as TweetHashtags; selecting dtypes explicitly works on old and new
# pandas alike.
df.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Encode the retweet flag as 0/1 so it can be fed to the model.
# Any value whose string form is 'true' (case-insensitive) becomes 1. The
# already-encoded value 1 is also accepted so that re-running this cell keeps
# the column intact; previously a second run turned every 1 into 0 because
# str(1) is '1', not 'true'. Everything else (including missing values) is 0.
df['TweetRetweetFlag'] = df['TweetRetweetFlag'].map(
    lambda flag: 1 if str(flag).lower() in ('true', '1') else 0
)

In [None]:
# Bucket a raw count into one of five ordinal classes. Per the original
# author, most retweets sit at the low end of these counts, so the smallest
# range gets the highest code and the code decreases as the count grows.
def bining(x):
    """Map a count to a bin code from 5 (x < 500) down to 1 (x >= 3000).

    Bins are half-open: [.., 500) -> 5, [500, 1000) -> 4, [1000, 2000) -> 3,
    [2000, 3000) -> 2, everything else -> 1. A NaN fails every comparison
    and therefore also falls through to 1.
    """
    # (exclusive upper bound, code) pairs, checked in ascending order.
    upper_bounds = ((500, 5), (1000, 4), (2000, 3), (3000, 2))
    for upper, code in upper_bounds:
        if x < upper:
            return code
    return 1

In [None]:
# Replace each raw count column with its bin code from `bining`.
# NOTE: the columns are overwritten, so running this cell a second time
# re-bins the codes themselves (all of which are < 500 and map to 5).
for count_column in (
    'TweetFavoritesCount',
    'UserFollowersCount',
    'UserFriendsCount',
    'UserListedCount',
    'UserTweetCount',
):
    df[count_column] = df[count_column].apply(bining)

In [None]:
# Flag tweets whose only hashtag is 'travel'. Per the original author, such
# tweets generally have a higher number of retweets.
def hashtag(x):
    """Return 1 if `x` is the single hashtag 'travel', otherwise 0.

    The comparison is case-insensitive and ignores surrounding whitespace.
    Anything that is not a string (NaN for a missing cell, None, numbers)
    yields 0. A comma-separated list of several hashtags can never equal
    'travel', so it yields 0 as well.
    """
    # An explicit type check replaces the former bare `except:`, which hid
    # every error (including KeyboardInterrupt) just to cope with non-strings.
    if not isinstance(x, str):
        return 0
    return 1 if x.strip().lower() == 'travel' else 0

In [None]:
# Collapse the hashtag text into the 0/1 "only 'travel'" indicator.
travel_only_flag = df['TweetHashtags'].apply(hashtag)
df['TweetHashtags'] = travel_only_flag

In [None]:
# Feature matrix (engineered columns) and regression target (retweet count).
feature_names = [
    'TweetRetweetFlag',
    'TweetFavoritesCount',
    'UserFollowersCount',
    'UserFriendsCount',
    'UserListedCount',
    'UserTweetCount',
    'TweetHashtags',
]
x = df[feature_names]
y = df['TweetRetweetCount']

In [None]:
# Split the data 70:30 into train and test sets.
# random_state is fixed (same value as the model seed used in this notebook)
# so the split, and therefore the reported RMSE, is reproducible across runs;
# without it every execution shuffled the rows differently.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=123
)

In [None]:
# Train an XGBoost regressor on the training split and judge it by the
# root-mean-squared error of its predictions on the held-out split.
xgb_r = xg.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=10,
    seed=123,
)

# Fit on the training split.
xgb_r.fit(X_train, y_train)

# Predict retweet counts for the held-out rows.
pred = xgb_r.predict(X_test)

# RMSE is the square root of the mean squared error.
test_mse = MSE(y_test, pred)
rmse = np.sqrt(test_mse)
print("RMSE : % f" % rmse)

### Reference:
>- https://medium.com/@teenageop/retweet-count-prediction-based-on-movie-tweets-c6ee7082c2ac