In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/feedback-prize-english-language-learning/sample_submission.csv
/kaggle/input/feedback-prize-english-language-learning/train.csv
/kaggle/input/feedback-prize-english-language-learning/test.csv


In [2]:
import nltk
import datetime as dt
import seaborn as sns

# For Text pre-processing
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Sklearn
# Model
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

In [3]:
train_df = pd.read_csv('/kaggle/input/feedback-prize-english-language-learning/train.csv')
test_df = pd.read_csv('/kaggle/input/feedback-prize-english-language-learning/test.csv')

In [4]:
train_df.describe()

Unnamed: 0,cohesion,syntax,vocabulary,phraseology,grammar,conventions
count,3911.0,3911.0,3911.0,3911.0,3911.0,3911.0
mean,3.127077,3.028254,3.235745,3.11685,3.032856,3.081053
std,0.662542,0.644399,0.583148,0.655997,0.699841,0.67145
min,1.0,1.0,1.0,1.0,1.0,1.0
25%,2.5,2.5,3.0,2.5,2.5,2.5
50%,3.0,3.0,3.0,3.0,3.0,3.0
75%,3.5,3.5,3.5,3.5,3.5,3.5
max,5.0,5.0,5.0,5.0,5.0,5.0


In [5]:
train_df.isnull().sum()

text_id        0
full_text      0
cohesion       0
syntax         0
vocabulary     0
phraseology    0
grammar        0
conventions    0
dtype: int64

In [6]:
import re
import string
def clean_p1(sen):
    sen = str(sen).lower()
    sen = re.sub('\[.*?\]', '', sen)
    sen = re.sub('https?://\S+|www\.\S+', '', sen)
    sen = re.sub('<.*?>+', '', sen)
    sen = re.sub('[%s]' % re.escape(string.punctuation), '', sen)
    sen = re.sub('\n', '', sen)
    sen = re.sub('\w*\d\w*', '', sen)
    return sen

In [7]:
# cleaning the text
def textPre(sen):
    sen = str(clean_p1(sen))
    sen = sen.lower()
    
    # below methods affecting the vocab
    #remove
    #filtered_words = [w for w in sen if w not in stopwords.words('english')]
    
    #stemming
    #stem = PorterStemmer()
    #stem_words = [stem.stem(w) for w in filtered_words]
    
    #lemmatize
    #lemma = WordNetLemmatizer()
    #lemma_words = [lemma.lemmatize(w) for w in filtered_words]
    
    return ' '.join(word_tokenize(sen))

In [8]:
train_df['full_text'] = train_df['full_text'].apply(textPre)
train_df.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,i think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,when a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,dear principalif u change the school policy of...,3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,the best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5


In [9]:
pred = train_df['full_text']
tar1 = train_df['cohesion']
tar2 = train_df['syntax']
tar3 = train_df['vocabulary']
tar4 = train_df['phraseology']
tar5 = train_df['grammar']
tar6 = train_df['conventions']

In [10]:
pred

0       i think that students would benefit from learn...
1       when a problem is a change you have to let it ...
2       dear principalif u change the school policy of...
3       the best time in life is when you become yours...
4       small act of kindness can impact in other peop...
                              ...                        
3906    i believe using cellphones in class for educat...
3907    working alone students do not have to argue wi...
3908    a problem is a chance for you to do your best ...
3909    many people disagree with albert schweitzers q...
3910    do you think that failure is the main thing fo...
Name: full_text, Length: 3911, dtype: object

In [11]:
#tfidf vectorizer
vec = TfidfVectorizer(max_df = 0.7, min_df = 5)
vec

In [12]:
x_train, x_val, y_train_1, y_val_1 = train_test_split(pred, tar1, test_size=0.2, random_state=1)

In [13]:
print(x_train.shape, y_train_1.shape)
print(x_val.shape, y_val_1.shape)

(3128,) (3128,)
(783,) (783,)


In [14]:
x_train_vec = vec.fit_transform(x_train)
x_train_vec

<3128x4583 sparse matrix of type '<class 'numpy.float64'>'
	with 369864 stored elements in Compressed Sparse Row format>

In [15]:
df_train = pd.DataFrame.sparse.from_spmatrix(x_train_vec, columns=vec.get_feature_names_out())
df_train.sample(10)

Unnamed: 0,abilities,ability,able,about,above,absent,absolute,absolutely,abuse,abut,...,youthe,youtube,youve,youwhen,youyou,ypu,yuo,zero,zone,zoo
548,0.0,0.0,0.060256,0.035553,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1479,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1837,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2230,0.0,0.0,0.0,0.018036,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
45,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
171,0.0,0.0,0.023937,0.028247,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2687,0.0,0.0,0.0,0.015202,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1793,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
484,0.0,0.0,0.0,0.058524,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2489,0.0,0.0,0.0,0.037031,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
x_val_vec = vec.transform(x_val) # dont need to call fit selector again
x_val_vec

<783x4583 sparse matrix of type '<class 'numpy.float64'>'
	with 93727 stored elements in Compressed Sparse Row format>

In [17]:
x_train.shape

(3128,)

In [18]:
x_val.shape

(783,)

In [19]:
x_train, x_val, y_train_2, y_val_2 = train_test_split(pred, tar2, test_size=0.2, random_state=42)

In [20]:
x_train, x_val, y_train_3, y_val_3 = train_test_split(pred, tar3, test_size=0.2, random_state=42)

In [21]:
x_train, x_val, y_train_4, y_val_4 = train_test_split(pred, tar4, test_size=0.2, random_state=42)

In [22]:
x_train, x_val, y_train_5, y_val_5 = train_test_split(pred, tar5, test_size=0.2, random_state=42)

In [23]:
x_train, x_val, y_train_6, y_val_6 = train_test_split(pred, tar6, test_size=0.2, random_state=42)

In [24]:
#ridge regression
ridge_reg1=GridSearchCV(Ridge(), param_grid={'alpha':np.linspace(0.0, 50, 20)}, cv=5)
ridge_model1=ridge_reg1.fit(x_train_vec, y_train_1)
print('ridge best_para', ridge_reg1.best_params_)
print('ridge best_score', ridge_reg1.best_score_)
ridge_r2_score_train1=r2_score(y_train_1,ridge_model1.predict(x_train_vec))
ridge_r2_score_check1=r2_score(y_val_1,ridge_model1.predict(x_val_vec))
print('ridge_r2_score_train=',ridge_r2_score_train1)
print('ridge_r2_score_check=',ridge_r2_score_check1)
print('rmse=', mean_squared_error(y_val_1, ridge_model1.predict(x_val_vec), squared=False))

ridge best_para {'alpha': 2.6315789473684212}
ridge best_score 0.2549779222014542
ridge_r2_score_train= 0.4739442188386165
ridge_r2_score_check= 0.24158590009362024
rmse= 0.5928241776782446


In [25]:
ridge_reg2=GridSearchCV(Ridge(), param_grid={'alpha':np.linspace(0.0, 50, 20)}, cv=5)
ridge_model2=ridge_reg2.fit(x_train_vec, y_train_2)
print('ridge best_para', ridge_reg2.best_params_)
print('ridge best_score', ridge_reg2.best_score_)
ridge_r2_score_train2=r2_score(y_train_2,ridge_model2.predict(x_train_vec))
ridge_r2_score_check2=r2_score(y_val_2,ridge_model2.predict(x_val_vec))
print('ridge_r2_score_train=',ridge_r2_score_train2)
print('ridge_r2_score_check=',ridge_r2_score_check2)
print('rmse=', mean_squared_error(y_val_2, ridge_model2.predict(x_val_vec), squared=False))

ridge best_para {'alpha': 50.0}
ridge best_score -0.0014503121092207306
ridge_r2_score_train= 0.031487546963153146
ridge_r2_score_check= 0.0021644906113360562
rmse= 0.6505049879640902


In [26]:
ridge_reg3=GridSearchCV(Ridge(), param_grid={'alpha':np.linspace(0.0, 50, 20)}, cv=5)
ridge_model3=ridge_reg3.fit(x_train_vec, y_train_3)
print('ridge best_para', ridge_reg3.best_params_)
print('ridge best_score', ridge_reg3.best_score_)
ridge_r2_score_train3=r2_score(y_train_3,ridge_model3.predict(x_train_vec))
ridge_r2_score_check3=r2_score(y_val_3,ridge_model3.predict(x_val_vec))
print('ridge_r2_score_train=',ridge_r2_score_train3)
print('ridge_r2_score_check=',ridge_r2_score_check3)
print('rmse=', mean_squared_error(y_val_3, ridge_model3.predict(x_val_vec), squared=False))

ridge best_para {'alpha': 50.0}
ridge best_score -0.000395679311694086
ridge_r2_score_train= 0.03082515694525878
ridge_r2_score_check= -0.005033160528871816
rmse= 0.5752922540030835


In [27]:
ridge_reg4=GridSearchCV(Ridge(), param_grid={'alpha':np.linspace(0.0, 50, 20)}, cv=5)
ridge_model4=ridge_reg4.fit(x_train_vec, y_train_4)
print('ridge best_para', ridge_reg4.best_params_)
print('ridge best_score', ridge_reg4.best_score_)
ridge_r2_score_train4=r2_score(y_train_4,ridge_model4.predict(x_train_vec))
ridge_r2_score_check4=r2_score(y_val_4,ridge_model4.predict(x_val_vec))
print('ridge_r2_score_train=',ridge_r2_score_train4)
print('ridge_r2_score_check=',ridge_r2_score_check4)
print('rmse=', mean_squared_error(y_val_4, ridge_model4.predict(x_val_vec), squared=False))

ridge best_para {'alpha': 50.0}
ridge best_score -0.0020289352111645177
ridge_r2_score_train= 0.028914200054741324
ridge_r2_score_check= -0.0012268098194148092
rmse= 0.6587947015934494


In [28]:
ridge_reg5=GridSearchCV(Ridge(), param_grid={'alpha':np.linspace(0.0, 50, 20)}, cv=5)
ridge_model5=ridge_reg5.fit(x_train_vec, y_train_5)
print('ridge best_para', ridge_reg5.best_params_)
print('ridge best_score', ridge_reg5.best_score_)
ridge_r2_score_train5=r2_score(y_train_5,ridge_model5.predict(x_train_vec))
ridge_r2_score_check5=r2_score(y_val_5,ridge_model5.predict(x_val_vec))
print('ridge_r2_score_train=',ridge_r2_score_train5)
print('ridge_r2_score_check=',ridge_r2_score_check5)
print('rmse=', mean_squared_error(y_val_5, ridge_model5.predict(x_val_vec), squared=False))

ridge best_para {'alpha': 50.0}
ridge best_score 0.00010915648783353938
ridge_r2_score_train= 0.0322814123847297
ridge_r2_score_check= 0.0002701893187990745
rmse= 0.6961483045314242


In [29]:
ridge_reg6=GridSearchCV(Ridge(), param_grid={'alpha':np.linspace(0.0, 50, 20)}, cv=5)
ridge_model6=ridge_reg6.fit(x_train_vec, y_train_6)
print('ridge best_para', ridge_reg6.best_params_)
print('ridge best_score', ridge_reg6.best_score_)
ridge_r2_score_train6=r2_score(y_train_6,ridge_model6.predict(x_train_vec))
ridge_r2_score_check6=r2_score(y_val_6,ridge_model6.predict(x_val_vec))
print('ridge_r2_score_train=',ridge_r2_score_train6)
print('ridge_r2_score_check=',ridge_r2_score_check6)
print('rmse=', mean_squared_error(y_val_6, ridge_model6.predict(x_val_vec), squared=False))

ridge best_para {'alpha': 34.21052631578948}
ridge best_score 0.0008779481566552505
ridge_r2_score_train= 0.043856569489621244
ridge_r2_score_check= 0.0009199580588070688
rmse= 0.679702302309662


In [30]:
test_df

Unnamed: 0,text_id,full_text
0,0000C359D63E,when a person has no experience on a job their...
1,000BAD50D026,Do you think students would benefit from being...
2,00367BB2546B,"Thomas Jefferson once states that ""it is wonde..."


In [31]:
x_test = test_df['full_text']
x_test_vec = vec.transform(x_test)
x_test_vec

<3x4583 sparse matrix of type '<class 'numpy.float64'>'
	with 425 stored elements in Compressed Sparse Row format>

In [32]:
pred_1 = ridge_model1.predict(x_test_vec)
test_df['cohesion'] = pred_1

In [33]:
pred_2 = ridge_model2.predict(x_test_vec)
test_df['syntax'] = pred_2

In [34]:
pred_3 = ridge_model3.predict(x_test_vec)
test_df['vocabulary'] = pred_3

In [35]:
pred_4 = ridge_model4.predict(x_test_vec)
test_df['phraseology'] = pred_4

In [36]:
pred_5 = ridge_model5.predict(x_test_vec)
test_df['grammar'] = pred_5

In [37]:
pred_6 = ridge_model6.predict(x_test_vec)
test_df['conventions'] = pred_6

In [38]:
test_df

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,when a person has no experience on a job their...,2.977096,3.093063,3.287507,3.167188,3.104038,3.174666
1,000BAD50D026,Do you think students would benefit from being...,2.918421,3.009507,3.223502,3.105798,3.01292,3.060604
2,00367BB2546B,"Thomas Jefferson once states that ""it is wonde...",3.458634,3.036429,3.2366,3.114464,3.043099,3.048575


In [39]:
test_df = test_df.drop(['full_text'], axis=1)

In [40]:
test_df

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,2.977096,3.093063,3.287507,3.167188,3.104038,3.174666
1,000BAD50D026,2.918421,3.009507,3.223502,3.105798,3.01292,3.060604
2,00367BB2546B,3.458634,3.036429,3.2366,3.114464,3.043099,3.048575


In [41]:
test_df.to_csv('submission.csv', index=False)