In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import GridSearchCV
import xgboost
from xgboost import XGBRegressor

In [2]:
# Load the training table from train.csv (path is relative to the working directory).
train = pd.read_csv("train.csv")

In [3]:
# Load the table to be scored from test.csv (path is relative to the working directory).
test = pd.read_csv("test.csv")

In [4]:
# Free-text columns whose gaps should become empty strings.
# NOTE(review): TITLE / BULLET_POINTS / DESCRIPTION are assumed to be the only
# text columns, based on the column listing printed for `test` -- confirm for `train`.
TEXT_COLUMNS = ['TITLE', 'BULLET_POINTS', 'DESCRIPTION']


def _fill_missing_text(frame):
    """Return a copy of ``frame`` with NaN replaced by '' in the text columns only.

    A blanket ``fillna('')`` would also write '' into numeric columns such as
    PRODUCT_LENGTH, turning them into object dtype and breaking later numeric
    operations (quantile / comparison). Numeric gaps are therefore left as NaN.
    """
    present = [col for col in TEXT_COLUMNS if col in frame.columns]
    if not present:
        return frame.copy()
    return frame.fillna({col: '' for col in present})


# Rebind instead of mutating in place so re-running the cell is harmless.
train = _fill_missing_text(train)
test = _fill_missing_text(test)

In [5]:
# Per-column count of missing values remaining in the test frame.
test.isna().sum()

PRODUCT_ID         0
TITLE              0
BULLET_POINTS      0
DESCRIPTION        0
PRODUCT_TYPE_ID    0
dtype: int64

In [6]:
# Keep only rows whose PRODUCT_LENGTH is strictly below the 85th percentile.
# The cutoff is computed once, and the result is an explicit copy so that the
# column assignments in later cells act on an independent frame rather than on
# a filtered slice (which triggers pandas' SettingWithCopyWarning).
# Note: re-running this cell trims the already-trimmed frame again.
length_cutoff = train['PRODUCT_LENGTH'].quantile(0.85)
train = train.loc[train['PRODUCT_LENGTH'] < length_cutoff].copy()

In [7]:
# Use the bullet points as the raw text feature for both frames.
for frame in (train, test):
    frame['text'] = frame['BULLET_POINTS']

In [8]:
# Lower-case the text and collapse every run of non-word characters into one space.
_NON_WORD_RUN = re.compile(r'\W+')


def _normalise(raw):
    """Return ``raw`` lower-cased with each non-word run replaced by a single space."""
    return _NON_WORD_RUN.sub(' ', raw.lower())


train['text'] = train['text'].map(_normalise)
test['text'] = test['text'].map(_normalise)

In [9]:
# Bag-of-words counts: the vocabulary is learned on the training text only and
# then reused to encode the test text.
count_vect = CountVectorizer(stop_words='english')
X_train_counts = count_vect.fit_transform(raw_documents=train['text'])
X_test_counts = count_vect.transform(raw_documents=test['text'])

In [10]:
# TF-IDF weights: fitted on the training text only, then applied to the test text.
tfidf_vect = TfidfVectorizer(stop_words='english')
X_train_tfidf = tfidf_vect.fit_transform(raw_documents=train['text'])
X_test_tfidf = tfidf_vect.transform(raw_documents=test['text'])

In [11]:
# Place the count block and the TF-IDF block side by side (column-wise) so each
# row carries both representations of the same text.
train_features = hstack([X_train_counts, X_train_tfidf])
test_features = hstack([X_test_counts, X_test_tfidf])

In [12]:
# Regression target, then a fixed-seed 70/30 train/validation split.
y = train['PRODUCT_LENGTH']
X_train, X_val, y_train, y_val = train_test_split(
    train_features,
    y,
    test_size=0.3,
    random_state=42,
)

In [47]:
# Booster-level settings, kept in `params` so they stay separate from the
# estimator-level arguments below.
params = {'objective': 'reg:squarederror', 'eta': 0.1}

# Up to 600 boosting rounds; early_stopping_rounds=10 is applied against the
# validation set passed to fit().
model = XGBRegressor(
    **params,
    n_estimators=600,
    early_stopping_rounds=10,
    random_state=42,
)

# The validation split doubles as the evaluation set; verbose=True prints the
# evaluation metric after every round.
model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    verbose=True,
)

# Score the stacked test features with the fitted model.
test_predictions = model.predict(test_features)


[0]	validation_0-rmse:492.10149
[1]	validation_0-rmse:450.17676
[2]	validation_0-rmse:413.09015
[3]	validation_0-rmse:380.33554
[4]	validation_0-rmse:351.57698
[5]	validation_0-rmse:326.38475
[6]	validation_0-rmse:304.37053
[7]	validation_0-rmse:285.33394
[8]	validation_0-rmse:268.86035
[9]	validation_0-rmse:254.50912
[10]	validation_0-rmse:242.42909
[11]	validation_0-rmse:232.11605
[12]	validation_0-rmse:223.38570
[13]	validation_0-rmse:216.02025
[14]	validation_0-rmse:209.79599
[15]	validation_0-rmse:204.53219
[16]	validation_0-rmse:200.21289
[17]	validation_0-rmse:196.48285
[18]	validation_0-rmse:193.41342
[19]	validation_0-rmse:190.88716
[20]	validation_0-rmse:188.73507
[21]	validation_0-rmse:187.00015
[22]	validation_0-rmse:185.45375
[23]	validation_0-rmse:184.19985
[24]	validation_0-rmse:183.18704
[25]	validation_0-rmse:182.31980
[26]	validation_0-rmse:181.59464
[27]	validation_0-rmse:180.86892
[28]	validation_0-rmse:180.29131
[29]	validation_0-rmse:179.81791
[30]	validation_0-rm

In [48]:
# Pair each test PRODUCT_ID with its predicted length and write the result to
# disk without the index column.
submission_df = pd.DataFrame(
    {
        'PRODUCT_ID': test["PRODUCT_ID"],
        'PRODUCT_LENGTH': test_predictions,
    }
)
submission_df.to_csv('hard_workinnn.csv', index=False)

In [49]:
# Preview the first five submission rows.
submission_df.head(n=5)

Unnamed: 0,PRODUCT_ID,PRODUCT_LENGTH
0,604373,549.242554
1,1729783,445.643463
2,1871949,430.214966
3,1107571,167.083664
4,624253,549.242554


In [50]:
# Sanity check: (rows, columns) of the submission frame.
submission_df.shape

(734736, 2)