In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

In [None]:
# Five-colour palette shared by the plots in this notebook (referenced as palette[i]).
palette = ["#7209B7","#3F88C5","#136F63","#F72585","#FFBA08"]
# Apply the "whitegrid" style and make the palette seaborn's default colour cycle.
sns.set(style="whitegrid", palette=palette)
# Last expression of the cell: the palette object itself is the cell output.
sns.color_palette(palette)

# palette2 = sns.diverging_palette(240, -240, n=10)
# sns.color_palette(palette2)

- **id** - unique ID for excerpt
- **url_legal** - URL of source - this is blank in the test set.
- **license** - license of source material - this is blank in the test set.
- **excerpt** - text to predict reading ease of
- **target** - reading ease
- **standard_error** - measure of spread of scores among multiple raters for each excerpt. Not included for test data.

In [None]:
# sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})

# def custom_palette(custom_colors):
#     customPalette = sns.set_palette(sns.color_palette(custom_colors))
#     sns.palplot(sns.color_palette(custom_colors), size=0.8)
#     plt.tick_params(axis='both', labelsize=0, length = 0)

# palette = ["#7209B7","#3F88C5","#136F63","#F72585","#FFBA08"]
# palette2 = sns.diverging_palette(120, 220, n=20)
# custom_palette(palette)

In [None]:
# Both competition CSV files are read from the same input folder.
input_dir = "../input/commonlitreadabilityprize"
train_data = pd.read_csv(input_dir + "/train.csv")
test_data = pd.read_csv(input_dir + "/test.csv")

# Quick sanity check: size of the training frame, then its first rows.
print(train_data.shape)
train_data.head()

In [None]:
# missingno bar chart over the training columns, sorted ascending
# (presumably by how complete each column is — confirm against the missingno docs).
msno.bar(train_data, color=palette[1], sort="ascending", figsize=(10,5), fontsize=12)

In [None]:
# Side-by-side density estimates of the two label columns.
fig, ax = plt.subplots(1, 2, figsize=(20, 6))
# `fill=True` is the supported spelling of the deprecated `shade=True`
# (newer seaborn releases warn on, and eventually reject, `shade`).
sns.kdeplot(ax=ax[0], data=train_data, x="target", fill=True)
sns.kdeplot(ax=ax[1], data=train_data, x='standard_error', fill=True, color=palette[1]);

In [None]:
# Order the bars by how often each license occurs (most frequent first).
license_order = train_data['license'].value_counts().index

plt.figure(figsize=(16, 8))
sns.countplot(data=train_data, y="license", color=palette[1], order=license_order)
plt.title("License Distribution");

# Data Cleaning

In [None]:
# a=train_data[train_data['excerpt'].str.contains('/')].reset_index()
# print(a['excerpt'][0])

# Number of training excerpts containing each markup-like fragment.
# Each fragment is handed to str.contains unchanged, one printed line per fragment.
for fragment in ('<', '>', '/', 'http', '<br />'):
    print(f"{fragment}: ", train_data['excerpt'].str.contains(fragment).sum())

In [None]:
!pip install contractions

In [None]:
import contractions
import re
import string
from nltk.corpus import wordnet
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

from nltk.corpus import stopwords
stop_words=set(stopwords.words("english"))

from nltk.stem.wordnet import WordNetLemmatizer
wnl = WordNetLemmatizer()

def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``pos_tag`` and only the first letter
    of the resulting tag is inspected: J -> adjective, V -> verb, R -> adverb.
    Every other tag (including N) maps to ``wordnet.NOUN``.
    """
    tag_initial = pos_tag([word])[0][1][0].upper()

    if tag_initial == "J":
        return wordnet.ADJ
    if tag_initial == "V":
        return wordnet.VERB
    if tag_initial == "R":
        return wordnet.ADV
    return wordnet.NOUN

def remove_punct(text):
    """Drop stop words and strip punctuation characters from a list of tokens.

    Parameters
    ----------
    text : iterable of str
        Tokens to filter.

    Returns
    -------
    list of str
        Tokens that are not in ``stop_words``, with every character found in
        ``string.punctuation`` removed; tokens left empty are discarded.
    """
    kept_tokens = []

    for token in text:
        if token in stop_words:
            continue

        stripped = "".join(ch for ch in token if ch not in string.punctuation)
        if stripped:
            kept_tokens.append(stripped)

    return kept_tokens

def preprocessing(text):
    """Normalise one excerpt into a space-joined string of cleaned lemmas.

    Steps, in order: lower-case and strip, replace "/" with a space, expand
    contractions, tokenize, lemmatize each token with its WordNet POS, then
    remove stop words and punctuation via ``remove_punct``.
    """
    normalised = contractions.fix(text.lower().strip().replace("/", " "))

    lemmas = [
        wnl.lemmatize(token, get_wordnet_pos(token))
        for token in word_tokenize(normalised)
    ]

    return " ".join(remove_punct(lemmas))


# Cleaned version of every excerpt, stored next to the raw text in both frames.
train_data['excerpt_clean'] = train_data['excerpt'].map(preprocessing)
test_data['excerpt_clean'] = test_data['excerpt'].map(preprocessing)

train_data.head()

# n-grams

In [None]:
from collections import Counter
from nltk import ngrams

# Concatenate all cleaned training excerpts into one string and tokenize it once.
# `text_tokenize` feeds the n-gram plots; `text` is read again later by the
# word-length distribution plot.
text = ' '.join(train_data['excerpt_clean'].tolist())
text_tokenize = word_tokenize(text)

def most_common_words(text, n, num, title):
    """Plot the ``num`` most frequent n-grams of a token sequence.

    Parameters
    ----------
    text : sequence of str
        Tokens to build the n-grams from.
    n : int
        Size of the n-grams (1 = unigram, 2 = bigram, ...).
    num : int
        How many of the most frequent n-grams to show.
    title : str
        Title of the figure.

    Returns
    -------
    None
        A new 20x6 figure with a horizontal bar chart is drawn.
    """
    # most_common(num) selects the top entries directly instead of sorting
    # every distinct n-gram and slicing the result afterwards.
    top_ngrams = Counter(ngrams(text, n)).most_common(num)

    # Each n-gram is a tuple of tokens; join it into a readable label.
    df = pd.DataFrame({
        'Word': [' '.join(gram) for gram, _ in top_ngrams],
        'Count': [count for _, count in top_ngrams],
    })

    plt.figure(figsize=(20, 6))
    sns.barplot(data=df, x="Count", y="Word", color=palette[1], orient='h')
    plt.title(title)
    plt.xlabel('Frequency')
    plt.ylabel('')

In [None]:
most_common_words(text_tokenize, 1, 20, 'UniGram')

In [None]:
most_common_words(text_tokenize, 2, 20, 'BiGram')

In [None]:
most_common_words(text_tokenize, 3, 20, 'TriGram')

# Create features

## Excerpt length (characters, excluding spaces)

In [None]:
# Number of characters in each excerpt once the space characters are removed.
train_data['length'] = train_data['excerpt'].map(lambda excerpt: len(excerpt.replace(" ", "")))
test_data['length'] = test_data['excerpt'].map(lambda excerpt: len(excerpt.replace(" ", "")))

sns.displot(data=train_data, x="length", bins=20, aspect=1.5)

## Sentence count

In [None]:
from nltk.tokenize import sent_tokenize

# Number of sentences that sent_tokenize finds in each excerpt.
train_data['count_sent'] = [len(sent_tokenize(excerpt)) for excerpt in train_data['excerpt']]
test_data['count_sent'] = [len(sent_tokenize(excerpt)) for excerpt in test_data['excerpt']]

sns.displot(data=train_data, x="count_sent", bins=20, aspect=1.5)

## Average sentence length

In [None]:
def avg_sent_len(text):
    """Return the mean sentence length of ``text`` in characters.

    Sentences come from ``sent_tokenize``; the mean is rounded to 3 decimals.
    """
    sentence_lengths = [len(sentence) for sentence in sent_tokenize(text)]
    return round(np.mean(sentence_lengths), 3)

# Average sentence length (characters) per excerpt.
train_data['avg_sent_len'] = train_data['excerpt'].map(avg_sent_len)
test_data['avg_sent_len'] = test_data['excerpt'].map(avg_sent_len)

sns.displot(data=train_data, x="avg_sent_len", bins=20, aspect=1.5)

## Sentence syllables

In [None]:
!pip install syllables

In [None]:
import syllables

# Syllable estimate for each excerpt: the whole excerpt string is passed to
# syllables.estimate in a single call (one value per excerpt, not per sentence,
# despite the "sent" in the column name).
# NOTE(review): syllables.estimate is presumably meant for single words —
# confirm that calling it on a full text gives a meaningful total.
train_data['count_sent_syll'] = train_data['excerpt'].apply(lambda x: syllables.estimate(x))
test_data['count_sent_syll'] = test_data['excerpt'].apply(lambda x: syllables.estimate(x))

sns.displot(data=train_data, x="count_sent_syll", bins=20, aspect=1.5)

## Count of punctuation marks

In [None]:
def count_punct(text):
    """Return how many characters of ``text`` are in ``string.punctuation``."""
    return sum(1 for char in text if char in string.punctuation)

# Punctuation-character count per excerpt.
train_data['count_punct'] = train_data['excerpt'].map(count_punct)
test_data['count_punct'] = test_data['excerpt'].map(count_punct)

sns.displot(data=train_data, x="count_punct", bins=20, aspect=1.5)

## Count uppercase letters

In [None]:
def count_uppercase(text):
    """Return the number of characters in ``text`` for which ``str.isupper`` is true."""
    return sum(1 for char in text if char.isupper())

# Upper-case character count per excerpt.
train_data['count_uppercase'] = train_data['excerpt'].map(count_uppercase)
test_data['count_uppercase'] = test_data['excerpt'].map(count_uppercase)

sns.displot(data=train_data, x="count_uppercase", bins=20, aspect=1.5)

## Count exclamation marks

In [None]:
# Number of "!" characters in each excerpt.
train_data['exclamation_marks'] = [excerpt.count("!") for excerpt in train_data['excerpt']]
test_data['exclamation_marks'] = [excerpt.count("!") for excerpt in test_data['excerpt']]

sns.displot(data=train_data, x="exclamation_marks", bins=20, aspect=1.5, color=palette[4])

## Count words

In [None]:
# Number of tokens that word_tokenize produces for each raw excerpt.
train_data['count_words'] = [len(word_tokenize(excerpt)) for excerpt in train_data['excerpt']]
test_data['count_words'] = [len(word_tokenize(excerpt)) for excerpt in test_data['excerpt']]

sns.displot(data=train_data, x="count_words", bins=20, aspect=1.5)

## Word length

In [None]:
def word_len(text):
    """Count the words of ``text`` by their length.

    Characters found in ``string.punctuation`` are removed first, the rest is
    split with ``word_tokenize``, and each token is counted under its length.

    Returns
    -------
    dict
        ``{word length: number of words}``, with keys in ascending order.
    """
    text_nopunct = "".join(ch for ch in text if ch not in string.punctuation)

    length_counts = {}
    for token in word_tokenize(text_nopunct):
        size = len(token)
        length_counts[size] = length_counts.get(size, 0) + 1

    return dict(sorted(length_counts.items()))

def _word_length_frame(excerpts, columns=None, min_lengths=range(1, 31)):
    """Build a table of word counts per word length, one row per excerpt.

    Parameters
    ----------
    excerpts : pandas.Series of str
        Raw excerpts; each is lower-cased before being passed to ``word_len``.
    columns : list, optional
        Column labels (word lengths) the result must have, in that order.
        When omitted, the observed lengths plus every length in
        ``min_lengths`` are used, sorted ascending.
    min_lengths : iterable of int
        Word lengths that always get a column when ``columns`` is omitted.

    Returns
    -------
    (pandas.Series, pandas.DataFrame)
        The per-excerpt dicts returned by ``word_len`` and the integer count
        table, indexed like ``excerpts``; missing lengths are filled with 0.
    """
    counts = excerpts.apply(lambda x: word_len(x.lower()))
    frame = pd.DataFrame(counts.tolist(), index=excerpts.index)
    if columns is None:
        columns = sorted(set(frame.columns) | set(min_lengths))
    return counts, frame.reindex(columns=columns).fillna(0).astype(int)


words_length_train, words_length_df_train = _word_length_frame(train_data['excerpt'])
cols_train = words_length_df_train.columns.tolist()

# Force the test table onto the training columns: a word length seen in only
# one of the two sets would otherwise give the frames different feature
# columns, and the scaler/models fitted on the training features would no
# longer accept the submission features.
words_length_test, words_length_df_test = _word_length_frame(test_data['excerpt'], columns=cols_train)
cols_test = words_length_df_test.columns.tolist()
# X_len = pd.concat([train_data, words_length_df], axis=1)
# X_len

# Word-length histogram over the whole cleaned corpus (`text` is the joined
# cleaned training text built for the n-gram section).
data_words_length = word_len(text.lower())
keys, vals = data_words_length.keys(), data_words_length.values()

fig = plt.figure(figsize=(20, 6))
plt.bar(keys, vals, align='center')

## Average word length

In [None]:
def avg_word_len(text):
    """Return the mean length of the tokens of ``text``, rounded to 3 decimals.

    Every token returned by ``word_tokenize`` contributes its character count.
    """
    token_lengths = [len(token) for token in word_tokenize(text)]
    return round(np.mean(token_lengths), 3)

# Average token length per excerpt.
train_data['avg_word_len'] = train_data['excerpt'].map(avg_word_len)
test_data['avg_word_len'] = test_data['excerpt'].map(avg_word_len)

sns.displot(data=train_data, x="avg_word_len", bins=20, aspect=1.5)

## Average syllables by word

In [None]:
def avg_syll_word(text):
    """Return the mean ``syllables.estimate`` value over the tokens of ``text``.

    Tokens come from ``word_tokenize``; the mean is rounded to 3 decimals.
    """
    syllable_counts = [syllables.estimate(token) for token in word_tokenize(text)]
    return round(np.mean(syllable_counts), 3)

# Average estimated syllables per token, per excerpt.
train_data['avg_syll_len'] = train_data['excerpt'].map(avg_syll_word)
test_data['avg_syll_len'] = test_data['excerpt'].map(avg_syll_word)

sns.displot(data=train_data, x="avg_syll_len", bins=20, aspect=1.5)

## Count unique words

In [None]:
# train_data['unique_word_count'] = train_data['excerpt'].apply(lambda x: len(pd.unique(x.lower().split())))
# Number of distinct tokens in each cleaned excerpt (a set keeps one copy of each token).
train_data['unique_word_count_clean'] = train_data['excerpt_clean'].map(lambda cleaned: len(set(word_tokenize(cleaned.lower()))))
test_data['unique_word_count_clean'] = test_data['excerpt_clean'].map(lambda cleaned: len(set(word_tokenize(cleaned.lower()))))

# fig, ax = plt.subplots(1,2,figsize=(20,6))
# sns.histplot(ax=ax[0], data=train_data, x="unique_word_count", bins=20)
# sns.histplot(ax=ax[1], data=train_data, x='unique_word_count_clean', bins=20, color=palette[1]);

In [None]:
!pip install textstat
!pip install py-readability-metrics

In [None]:
import textstat
from readability import Readability
# test_data=train_data['excerpt'][0]
# r = Readability(test_data)

def readability_features(text):
    """Compute every readability score used as a feature for one excerpt.

    The ``Readability`` object is built once and reused for all of its
    formulas, instead of re-analysing the same text for each score.

    Parameters
    ----------
    text : str
        Raw excerpt.

    Returns
    -------
    dict
        Feature name -> score. Keys are in the order the columns are added to
        the data frames.
    """
    r = Readability(text)

    return {
        # Flesch Reading Ease
        'flesch_reading_ease2': r.flesch().score,
        # Flesch-Kincaid Grade Level
        'flesch_kincaid_grade2': r.flesch_kincaid().score,
        # SMOG (taken from textstat)
        'smog_index': textstat.smog_index(text),
        # Coleman Liau Index
        'coleman_liau_index2': r.coleman_liau().score,
        # Automated Readability Index (ARI)
        'automated_readability_index2': r.ari().score,
        # Dale Chall Readability
        'dale_chall_readability_score2': r.dale_chall().score,
        # Linsear Write
        'linsear_write_formula2': r.linsear_write().score,
        # Gunning Fog
        'gunning_fog2': r.gunning_fog().score,
        # SPACHE
        'spache': r.spache().score,
        # Difficult words (taken from textstat)
        'difficult_words': textstat.difficult_words(text),
    }


# Same feature columns, in the same order, for the training and the test frame.
for frame in (train_data, test_data):
    scores = pd.DataFrame(frame['excerpt'].map(readability_features).tolist(), index=frame.index)
    for column in scores.columns:
        frame[column] = scores[column]

## Grammar checker

In [None]:
!pip install language_tool_python

In [None]:
import language_tool_python
# LanguageTool runs as a background server process; start it once for both frames.
tool = language_tool_python.LanguageTool('en-US')

try:
    # Feature = number of rule matches LanguageTool reports for the excerpt.
    train_data['gramm_error'] = train_data['excerpt'].apply(lambda x: len(tool.check(x)))
    test_data['gramm_error'] = test_data['excerpt'].apply(lambda x: len(tool.check(x)))
finally:
    # Shut the server down even if a check fails, so the process is not leaked.
    tool.close()

sns.displot(data=train_data, x="gramm_error", bins=20, aspect=1.5)

## Lexical richness

In [None]:
!pip install lexicalrichness

In [None]:
from lexicalrichness import LexicalRichness

# lex = LexicalRichness(train_data['excerpt'][5])

# print('word count',lex.words)
# print('unique term count',lex.terms)
# print('type-token ratio',lex.ttr)
# print('root type-token ratio',lex.rttr)
# print('corrected type-token ratio',lex.cttr)
# print('mean segmental type-token ratio',lex.msttr(segment_window=25))
# print('moving average type-token ratio',lex.mattr(window_size=25))
# print('Measure of Textual Lexical Diversity',lex.mtld(threshold=0.72))
# print('hypergeometric distribution diversity',lex.hdd(draws=42))

def lexical_richness_features(text):
    """Return the three type-token-ratio variants for one excerpt.

    A single ``LexicalRichness`` object is built per excerpt and queried for
    all three measures instead of being re-created for each of them.

    Returns
    -------
    dict
        ``{'ttr': ..., 'rttr': ..., 'cttr': ...}``
    """
    lex = LexicalRichness(text)
    return {'ttr': lex.ttr, 'rttr': lex.rttr, 'cttr': lex.cttr}


# Add the ttr / rttr / cttr columns (in that order) to both frames.
for frame in (train_data, test_data):
    richness = pd.DataFrame(frame['excerpt'].map(lexical_richness_features).tolist(), index=frame.index)
    for column in richness.columns:
        frame[column] = richness[column]

# One distribution plot per measure, training data only.
for column in ('ttr', 'rttr', 'cttr'):
    sns.displot(data=train_data, x=column, bins=20, aspect=1.5)

In [None]:
train_data.head()

In [None]:
# Correlate the numeric columns only: train_data also holds text columns
# (id, excerpt, ...), which recent pandas versions refuse to correlate.
corr = train_data.select_dtypes(include="number").corr(method="pearson")
# Alternatives: method="spearman" or method="kendall".

f, ax = plt.subplots(figsize=(27, 27))

# All-False mask, i.e. no cell is hidden. The builtin `bool` is used because
# the `np.bool` alias was removed from NumPy.
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool), cmap=sns.diverging_palette(220, 10, as_cmap=True), square=True, ax=ax, annot=True);

# Model

In [None]:
train_data.columns

In [None]:
# Hand-crafted feature columns, listed once and used for both frames.
feature_columns = [
    'length', 'count_sent', 'avg_sent_len', 'count_sent_syll', 'count_punct',
    'count_uppercase', 'exclamation_marks', 'count_words', 'avg_word_len',
    'avg_syll_len', 'unique_word_count_clean', 'flesch_reading_ease2',
    'flesch_kincaid_grade2', 'smog_index', 'coleman_liau_index2',
    'automated_readability_index2', 'dale_chall_readability_score2',
    'linsear_write_formula2', 'gunning_fog2', 'spache', 'difficult_words',
    'gramm_error', 'ttr', 'rttr', 'cttr',
]

X = train_data[feature_columns]

y = train_data['target']

# Same columns, same order, for the submission data.
X_sub = test_data[feature_columns]

print(X.shape)
print(X_sub.shape)

In [None]:
print(words_length_df_train.shape)
print(words_length_df_test.shape)

# The word-length tables use integer column labels while the other features
# use strings; scikit-learn rejects frames whose column names mix types, so
# the integer labels are turned into strings such as "word_len_3".
X = pd.concat([X, words_length_df_train.add_prefix("word_len_")], axis=1)
X_sub = pd.concat([X_sub, words_length_df_test.add_prefix("word_len_")], axis=1)

print(X.shape)
print(X_sub.shape)

In [None]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for evaluation. A fixed seed keeps the
# split (and every score computed on it) identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transformation to the hold-out split and the submission features.
scale = StandardScaler()
X_train_sc = scale.fit_transform(X_train)
X_test_sc = scale.transform(X_test)
X_sub_sc = scale.transform(X_sub)

print(X_train_sc.shape)
print(X_test_sc.shape)

In [None]:
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import SGDRegressor, LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.metrics import explained_variance_score, max_error, mean_absolute_error, r2_score, explained_variance_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, mean_squared_error, mean_squared_log_error
from sklearn.metrics import median_absolute_error, mean_poisson_deviance, mean_gamma_deviance
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Candidate regressors; apart from the SVR kernels, all use library defaults.
models=[("Linear Regression", LinearRegression()),
        ("Ridge Regression", Ridge()),
        ("Lasso Regression", Lasso()),
        ("Elastic-Net Regression", ElasticNet()),
        ("Stochastic Gradient Descent", SGDRegressor()),
        ("Decision Tree", DecisionTreeRegressor()),
        ("Random Forest", RandomForestRegressor()),
        ("Extra Trees", ExtraTreesRegressor()),
        ("Gradient Boosting", GradientBoostingRegressor()),
        ("KNeighbors", KNeighborsRegressor()),
        ("SVM linear", SVR(kernel='linear')),
        ("SVM rbf", SVR(kernel='rbf')),
        ("Ada Boost", AdaBoostRegressor())]

# 10-fold cross-validation on the scaled training split. No `scoring` is
# passed, so each fold reports the estimator's default score (R^2 for
# scikit-learn regressors); the mean and standard deviation over the folds
# are printed per model.
for name, model in models:
    results = cross_val_score(model, X_train_sc, y_train, cv=10)
    print(f"\x1b[96m{name}\x1b[0m: \x1b[93m{results.mean():.4f}\x1b[0m ± {results.std():.4f}")

In [None]:
# Ridge regression evaluated on the hold-out split.
rdg = Ridge(alpha=0.4)
rdg.fit(X_train_sc, y_train)

rdg_predict = rdg.predict(X_test_sc)
# RMSE as sqrt(MSE): the `squared=False` shortcut is deprecated and no longer
# accepted by recent scikit-learn releases.
print(np.sqrt(mean_squared_error(y_test, rdg_predict)))
print("max_error: ", max_error(y_test, rdg_predict))

In [None]:
# RBF-kernel support vector regression evaluated on the hold-out split.
svr = SVR(C=0.6, gamma=0.01, kernel='rbf')
svr.fit(X_train_sc, y_train)

svr_predict = svr.predict(X_test_sc)
# RMSE as sqrt(MSE): the `squared=False` shortcut is deprecated and no longer
# accepted by recent scikit-learn releases.
print(np.sqrt(mean_squared_error(y_test, svr_predict)))
print("max_error: ", max_error(y_test, svr_predict))

In [None]:
# features = np.arange(1, 50, 1)
# results_test = []

# for feature in features:
#     rf = KNeighborsRegressor(n_neighbors=feature, n_jobs=-1)
#     rf.fit(X_train_sc, y_train)
    
#     results_test.append(mean_squared_error(y_test, rf.predict(X_test_sc), squared=False))

# fig, ax = plt.subplots(figsize=(25,8)) 
# plt.plot(features, results_test, 'b')

# ax.set_axisbelow(True)
# ax.minorticks_on()
# ax.grid(which='major', linestyle='-', linewidth=0.5, color='black',)
# ax.grid(which='minor', linestyle=':', linewidth=0.5, color='black', alpha=0.7)

# plt.gca().xaxis.set_major_locator(plt.MultipleLocator(0.1))

# print(results_test[results_test.index(min(results_test))])
# print(features[results_test.index(min(results_test))])

In [None]:
# Random forest (seeded) evaluated on the hold-out split.
rf = RandomForestRegressor(n_estimators=39, max_features=24, min_samples_split=2, random_state=15, n_jobs=-1)
rf.fit(X_train_sc, y_train)

rf_predict = rf.predict(X_test_sc)
# RMSE as sqrt(MSE): the `squared=False` shortcut is deprecated and no longer
# accepted by recent scikit-learn releases.
print(np.sqrt(mean_squared_error(y_test, rf_predict)))
print("max_error: ", max_error(y_test, rf_predict))

In [None]:
# Show the 30 most important features of the random forest. Ranking all
# importances first and then keeping the last 30 positions selects the top
# features; slicing the raw array with [:30] would only keep the first 30
# columns, whatever their importance.
feature_importance = rf.feature_importances_
sorted_idx = np.argsort(feature_importance)[-30:]
pos = np.arange(sorted_idx.shape[0]) + .5

fig = plt.figure(figsize=(17, 6))
plt.barh(pos, feature_importance[sorted_idx], align='center')
# Feature names come from X_train, whose column order matches the scaled matrix.
plt.yticks(pos, np.array(X_train.columns)[sorted_idx])
plt.title('Feature Importance')

In [None]:
# Predict on the scaled submission features with the fitted random forest.
# Note: this rebinds `rf_predict`, which until now held the hold-out predictions.
rf_predict = rf.predict(X_sub_sc)

# One row per test excerpt (id, predicted target), written without the index column.
output = pd.DataFrame({'id': test_data['id'], 'target': rf_predict})
output.to_csv('submission.csv', index=False)

In [None]:
# Gradient boosting (seeded) evaluated on the hold-out split.
gb = GradientBoostingRegressor(n_estimators=71, max_features=23, random_state=0)
gb.fit(X_train_sc, y_train)

gb_predict = gb.predict(X_test_sc)
# RMSE as sqrt(MSE): the `squared=False` shortcut is deprecated and no longer
# accepted by recent scikit-learn releases.
print(np.sqrt(mean_squared_error(y_test, gb_predict)))
print("max_error: ", max_error(y_test, gb_predict))

In [None]:
# Show the 30 most important features of the gradient boosting model. Ranking
# all importances first and then keeping the last 30 positions selects the
# top features; slicing the raw array with [:30] would only keep the first 30
# columns, whatever their importance.
feature_importance = gb.feature_importances_
sorted_idx = np.argsort(feature_importance)[-30:]
pos = np.arange(sorted_idx.shape[0]) + .5

fig = plt.figure(figsize=(17, 6))
plt.barh(pos, feature_importance[sorted_idx], align='center')
# Feature names come from X_train, whose column order matches the scaled matrix.
plt.yticks(pos, np.array(X_train.columns)[sorted_idx])
plt.title('Feature Importance')

In [None]:
# 10-nearest-neighbours regression evaluated on the hold-out split.
kn = KNeighborsRegressor(n_neighbors=10, n_jobs=-1)
kn.fit(X_train_sc, y_train)

kn_predict = kn.predict(X_test_sc)
# RMSE as sqrt(MSE): the `squared=False` shortcut is deprecated and no longer
# accepted by recent scikit-learn releases.
print(np.sqrt(mean_squared_error(y_test, kn_predict)))
print("max_error: ", max_error(y_test, kn_predict))