# <span style="color:#D81B60"> Classifying YouTube Comments for Advertisement </span>

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,VotingClassifier,ExtraTreesClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score,confusion_matrix,auc


## <span style="color:#D81B60"> Customized Utility Functions </span>

In [2]:
def days_group(day):
    if(day<5):
        return "weekday"
    elif(day>4 and day <7):
        return "weekend"
    else:
        return "unknown_day"
        
def time_group(time):
    if(time==99):
        return "unknown_time"
    elif(time in range(17,21)):
        return "peak"
    else:
        return "normal"


In [3]:
def text_cleaning(content):
    lemm = WordNetLemmatizer()
    stemmer = SnowballStemmer("english")
    content = re.sub('https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+','', content)               # remove urls 
    content = re.sub("<[^>]*>"," ",content)                                               # remove html tags 
    content = re.sub(r"$\d+\W+|\b\d+\b|\W+\d+$", "", content)                             # remove Numbers 
    content = re.sub(r'[^\w\s]',' ',content.lower())                                      # remove alphan-numeric characters
    c_list=content.split(" ")
    new_list=[]
    for i in range(len(c_list)):
        if(len(c_list[i])>2):
            word=c_list[i]
            word=stemmer.stem(word)
            new_list.append(lemm.lemmatize(word))
    content=" ".join(new_list)
    return content

In [4]:
def url_presence(content):
    urls = re.findall('https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', content)
    presence=0
    if("bit.ly" in content):
        presence=1
    if("moneygq.com" in content.lower()):
        presence=1    
    if("zonepa.com" in content.lower()):
        presence=1        
    if(len(urls)>0 and (('http://www.youtube.com' not in urls) and ('https://www.youtube.com' not in urls) and \
                      ('http://youtu.be' not in urls) and ('https://youtu.be' not in urls))):
    #     print(urls)
        presence=1
    return presence

In [5]:
def near_zero_var(df,threshold=0.0):
    cols=df.columns
    near_zero_vars=[]
    for col in cols:
        if(np.var(df[col])<=threshold): # variable has zero variance and doesn't affect the target
            near_zero_vars.append(col)
    
    return(list(set(cols)-set(near_zero_vars)))    

In [6]:
def find_correlation(df, thresh=0.9):
    """
    Given a numeric pd.DataFrame, this will find highly correlated features,
    and return a list of features to remove
    params:
    - df : pd.DataFrame
    - thresh : correlation threshold, will remove one of pairs of features with
               a correlation greater than this value
    """
    
    corrMatrix = df.corr()
    corrMatrix.loc[:,:] =  np.tril(corrMatrix, k=-1)

    already_in = set()
    result = []

    for col in corrMatrix:
        perfect_corr = corrMatrix[col][corrMatrix[col] > thresh].index.tolist()
        if perfect_corr and col not in already_in:
            already_in.update(set(perfect_corr))
            perfect_corr.append(col)
            result.append(perfect_corr)


    select_nested = [f[1:] for f in result]
    select_flat = [i for j in select_nested for i in j]
    return select_flat

In [7]:
def auc_reduction(df,true_labels,threshold=0.0):
    res_features=[]
    for col in df.columns:
        if(auc(df[col],true_labels,reorder=True)>=threshold):
            res_features.append(col)
    return res_features

In [8]:
def train_test_model(model,model_name="random_forest"):
    print("Training Model : ######################## ",model_name," ############################### ")
    model.fit(train_x,train_y)
    train_pred=model.predict(train_x)
    
    print("Validate Model : ######################## ",model_name," ############################### ")
    valid_pred=model.predict(valid_x)
    print()
    
    print("Training Error : ")
    print(roc_auc_score(train_y,train_pred))
    print()
    
    print("Validation Error : ")
    print(roc_auc_score(valid_y,valid_pred))
    print()
    
    print("Training Confusion Matrix :")
    print(confusion_matrix(train_y,train_pred))
    print()
    
    print("Validation Confusion Matrix :")
    print(confusion_matrix(valid_y,valid_pred))
    print()
    
#     print("Test Predictions :")
    if(model_name=="vooting_classifier"):
        pred_test=model.predict(final_test_df)
        response=pd.DataFrame({
            "ID":test_ids,
            "CLASS":pred_test
        })[["ID","CLASS"]]
        response.to_csv("./result/"+model_name+"_result.csv",index=False)
        return model
    else:
#         pred_test=model.predict_proba(final_test_df)
        pred_test=model.predict(final_test_df)
        response=pd.DataFrame({
            "ID":test_ids,
#             "CLASS":pred_test[:,1]
            "CLASS":pred_test
        })[["ID","CLASS"]]
        response.to_csv("./result/"+model_name+"_result.csv",index=False)
    
    
    features=final_train_df.columns
#     features.extend(['url_presence','len_review'])
    
    if((model_name!="logistic_regresssion") and (model_name!="SVC")):
        feature_importance=pd.DataFrame({
            "features":features,
            "importance":model.feature_importances_*100
        })[["features","importance"]]
        feature_importance.to_csv("./result/"+model_name+"_importance.csv",index=False)
        print("Top 10 features from %s model " %model_name)
        print(feature_importance[feature_importance["importance"]>0].sort_values("importance",ascending=False).head(10))
    else:
        logistic_regression_coefficient=pd.DataFrame({
            "features":features,
            "coefficient":model.coef_[0][:]
        })[["features","coefficient"]]
        print("Regression Coefficient from %s model " %model_name)
        print(logistic_regression_coefficient)
#         print(model.coef_[0][:])
    
    return model

### <span style="color:#F57C00"> Load Data </span>

In [9]:
data_dir="./Data Sets/"
train_data=pd.read_csv(data_dir+"/train.csv")
test_data=pd.read_csv(data_dir+"/test.csv")
print("Train data size :",train_data.shape[0])
print("Test data size :",test_data.shape[0])

Train data size : 1157
Test data size : 799


In [10]:
test_ids=test_data.ID
test_data.drop("ID",axis=1,inplace=True)
true_labels=train_data.CLASS
train_data.drop("CLASS",axis=1,inplace=True)
combine_data=train_data.append(test_data,ignore_index=True)
combine_data.drop("COMMENT_ID",inplace=True,axis=1)
combine_data.DATE=pd.to_datetime(combine_data.DATE)

In [11]:
combine_data.head()

Unnamed: 0,AUTHOR,DATE,CONTENT
0,Julius NM,2013-11-07 06:20:48,"Huh, anyway check out this you[tube] channel: ..."
1,ElNino Melendez,2013-11-09 08:28:43,me shaking my sexy ass on my channel enjoy ^_^ ﻿
2,GsMega,2013-11-10 16:05:38,watch?v=vtaRGgvGtWQ Check this out .﻿
3,ferleck ferles,2013-11-27 21:39:24,Subscribe to my channel ﻿
4,BeBe Burkey,2013-11-28 16:30:13,and u should.d check my channel and tell me wh...


### <span style="color:#F57C00"> Summary of Null Values present in the Data </span>

In [12]:
print("Percentage of null values present in the Train Data : \n")
print((train_data.isnull().sum()/train_data.shape[0])*100)

Percentage of null values present in the Train Data : 

COMMENT_ID     0.000000
AUTHOR         0.000000
DATE          11.927398
CONTENT        0.000000
dtype: float64


In [13]:
print("Percentage of null values present in the Test Data : \n")
print((test_data.isnull().sum()/test_data.shape[0])*100)

Percentage of null values present in the Test Data : 

COMMENT_ID     0.00000
AUTHOR         0.00000
DATE          13.39174
CONTENT        0.00000
dtype: float64


In [14]:
print("Distribution of Adds : %s and Comments : %s " %tuple(true_labels.value_counts().values))

Distribution of Adds : 586 and Comments : 571 


## <span style="color:#D81B60"> Feature Engineering  </span>

### <span style="color:#F57C00"> Create Time Based Features </span>

In [15]:
combine_data["week_day"]=combine_data.DATE.apply(lambda x:x.dayofweek)
combine_data["time_day"]=combine_data.DATE.apply(lambda x:x.hour)
combine_data["month"]=combine_data.DATE.apply(lambda x:x.month)
combine_data["year"]=combine_data.DATE.apply(lambda x:x.year)

In [16]:
combine_data.head()

Unnamed: 0,AUTHOR,DATE,CONTENT,week_day,time_day,month,year
0,Julius NM,2013-11-07 06:20:48,"Huh, anyway check out this you[tube] channel: ...",3.0,6.0,11.0,2013.0
1,ElNino Melendez,2013-11-09 08:28:43,me shaking my sexy ass on my channel enjoy ^_^ ﻿,5.0,8.0,11.0,2013.0
2,GsMega,2013-11-10 16:05:38,watch?v=vtaRGgvGtWQ Check this out .﻿,6.0,16.0,11.0,2013.0
3,ferleck ferles,2013-11-27 21:39:24,Subscribe to my channel ﻿,2.0,21.0,11.0,2013.0
4,BeBe Burkey,2013-11-28 16:30:13,and u should.d check my channel and tell me wh...,3.0,16.0,11.0,2013.0


### <span style="color:#F57C00"> Impute value for time based Features </span>

In [17]:
for col in combine_data.columns[-4:]:
    combine_data.fillna(value={col:99},inplace=True)

In [18]:
combine_data.drop("DATE",axis=1,inplace=True)
combine_data.head()

Unnamed: 0,AUTHOR,CONTENT,week_day,time_day,month,year
0,Julius NM,"Huh, anyway check out this you[tube] channel: ...",3.0,6.0,11.0,2013.0
1,ElNino Melendez,me shaking my sexy ass on my channel enjoy ^_^ ﻿,5.0,8.0,11.0,2013.0
2,GsMega,watch?v=vtaRGgvGtWQ Check this out .﻿,6.0,16.0,11.0,2013.0
3,ferleck ferles,Subscribe to my channel ﻿,2.0,21.0,11.0,2013.0
4,BeBe Burkey,and u should.d check my channel and tell me wh...,3.0,16.0,11.0,2013.0


In [19]:
combine_data["traffic_group"]=combine_data.time_day.apply(time_group)
combine_data["day_group"]=combine_data.week_day.apply(days_group)
combine_data.head()

Unnamed: 0,AUTHOR,CONTENT,week_day,time_day,month,year,traffic_group,day_group
0,Julius NM,"Huh, anyway check out this you[tube] channel: ...",3.0,6.0,11.0,2013.0,normal,weekday
1,ElNino Melendez,me shaking my sexy ass on my channel enjoy ^_^ ﻿,5.0,8.0,11.0,2013.0,normal,weekend
2,GsMega,watch?v=vtaRGgvGtWQ Check this out .﻿,6.0,16.0,11.0,2013.0,normal,weekend
3,ferleck ferles,Subscribe to my channel ﻿,2.0,21.0,11.0,2013.0,normal,weekday
4,BeBe Burkey,and u should.d check my channel and tell me wh...,3.0,16.0,11.0,2013.0,normal,weekday


In [20]:
time_df=pd.get_dummies(combine_data[["traffic_group","day_group"]],drop_first=True)

In [21]:
combine_data["url_presence"]=combine_data["CONTENT"].apply(url_presence)
combine_data["url_presence"][:1157].sum()

124

In [22]:
combine_data["cleaned_content"]=combine_data.CONTENT.apply(text_cleaning)

### <span style="color:#F57C00"> Creating Content Based Features </span>

**********************************************************
    a) Length of comment
    b) Presence of external url in comment
    c) tf-idf based N-grams(1,3) 
**********************************************************    

In [23]:
combine_data["len_review"]=combine_data.cleaned_content.apply(lambda x:len(x.split(" ")))

In [24]:
combine_data.head()

Unnamed: 0,AUTHOR,CONTENT,week_day,time_day,month,year,traffic_group,day_group,url_presence,cleaned_content,len_review
0,Julius NM,"Huh, anyway check out this you[tube] channel: ...",3.0,6.0,11.0,2013.0,normal,weekday,0,huh anyway check out this you tube channel kob...,9
1,ElNino Melendez,me shaking my sexy ass on my channel enjoy ^_^ ﻿,5.0,8.0,11.0,2013.0,normal,weekend,0,shake sexi as channel enjoy,5
2,GsMega,watch?v=vtaRGgvGtWQ Check this out .﻿,6.0,16.0,11.0,2013.0,normal,weekend,0,watch vtarggvgtwq check this out,5
3,ferleck ferles,Subscribe to my channel ﻿,2.0,21.0,11.0,2013.0,normal,weekday,0,subscrib channel,2
4,BeBe Burkey,and u should.d check my channel and tell me wh...,3.0,16.0,11.0,2013.0,normal,weekday,0,and should check channel and tell what should ...,9


In [25]:
tf_clf=TfidfVectorizer(ngram_range=(1,3),max_features=200,norm ='l2',stop_words="english")
tfidf_data=tf_clf.fit_transform(combine_data.cleaned_content).toarray()

In [26]:
tfidf_df=pd.DataFrame(data=tfidf_data,columns=tf_clf.get_feature_names())

In [27]:
final_df=pd.concat([tfidf_df,time_df],axis=1)

In [28]:
train_df=final_df[:train_data.shape[0]]
test_df=final_df[train_data.shape[0]:]
print(train_df.shape)
print(test_df.shape)

(1157, 204)
(799, 204)


In [29]:
# df=pd.DataFrame(data=train_df,columns=tf_clf.get_feature_names())
df=train_df

## <span style="color:#D81B60"> Feature Reduction Techniques </span>

**********************************
    a) nearzerovariance
    b) multicollinerity 
    c) auc reduction
**********************************

In [30]:
auc_red_features=auc_reduction(df,true_labels=true_labels,threshold=0.1)
df=df[auc_red_features]

In [31]:
threshold=0.0
non_near_zero_vars=near_zero_var(df,threshold=threshold)
print("Number of Variables having variance < %s is %s out of %s"%(threshold,len(non_near_zero_vars),len(df.columns)))

Number of Variables having variance < 0.0 is 204 out of 204


In [32]:
correlated_features=find_correlation(df[non_near_zero_vars],thresh=0.95)
final_features=list(set(non_near_zero_vars)-set(correlated_features))
print(len(final_features))

176


In [33]:
final_train_df=train_df[final_features]
final_test_df=test_df[final_features]
final_train_df["url_presence"]=combine_data["url_presence"][:train_data.shape[0]].values
final_train_df["len_review"]=combine_data["len_review"][:train_data.shape[0]].values
final_test_df["url_presence"]=combine_data["url_presence"][train_data.shape[0]:].values
final_test_df["len_review"]=combine_data["len_review"][train_data.shape[0]:].values


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [34]:
print(final_train_df.shape)
print(final_test_df.shape)

(1157, 178)
(799, 178)


## <span style="color:#D81B60"> Split Data into Train and Validation  </span>

In [35]:
train_x,valid_x,train_y,valid_y=train_test_split(final_train_df,true_labels,test_size=0.31,random_state=2018)

In [36]:
print(train_x.shape)
print(valid_x.shape)
print(final_test_df.shape)

(798, 178)
(359, 178)
(799, 178)


## <span style="color:#D81B60"> Execute the Classifications Models </span>

***********************************************************************
    a) Random Forest 
    b) Extra Tree Classifier
    c) Gradient Boosting Machine
    d) Logistic Regression
    e) Support Vector Machine
    f) Vooting Classifier(soft)
***********************************************************************

In [72]:
rf_clf=RandomForestClassifier(random_state=2018,criterion='entropy',min_impurity_decrease=10e-5,max_depth=25,n_estimators=100,oob_score=True,bootstrap=True)
train_test_model(rf_clf,"Random_Forest")

Training Model : ########################  Random_Forest  ############################### 
Validate Model : ########################  Random_Forest  ############################### 

Training Error : 
0.9876543209876543

Validation Error : 
0.9667111552548264

Training Confusion Matrix :
[[393   0]
 [ 10 395]]

Validation Confusion Matrix :
[[175   3]
 [  9 172]]

Top 10 features from Random_Forest model 
                       features  importance
176                url_presence   10.826242
37                     subscrib   10.585461
129                       check   10.134094
124  traffic_group_unknown_time    6.214973
177                  len_review    5.913495
68                         plea    4.489510
90                      channel    4.121508
63                       youtub    3.443827
133                        song    1.884689
42                         view    1.844715


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=25, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0001, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=True, random_state=2018, verbose=0, warm_start=False)

In [75]:
ex_clf=ExtraTreesClassifier(random_state=2018,criterion='entropy',min_impurity_decrease=10e-5,max_depth=25,n_estimators=100,oob_score=True,bootstrap=True)
train_test_model(ex_clf,"Extra_Classifier")

Training Model : ########################  Extra_Classifier  ############################### 
Validate Model : ########################  Extra_Classifier  ############################### 

Training Error : 
0.9814814814814814

Validation Error : 
0.969473586194053

Training Confusion Matrix :
[[393   0]
 [ 15 390]]

Validation Confusion Matrix :
[[175   3]
 [  8 173]]

Top 10 features from Extra_Classifier model 
                       features  importance
176                url_presence   12.414360
124  traffic_group_unknown_time    9.597071
37                     subscrib    8.646090
129                       check    7.630790
68                         plea    4.193055
63                       youtub    3.367772
90                      channel    2.840946
152                video youtub    2.324157
177                  len_review    2.292125
128                 check video    2.245106


ExtraTreesClassifier(bootstrap=True, class_weight=None, criterion='entropy',
           max_depth=25, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0001, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=True, random_state=2018, verbose=0, warm_start=False)

In [55]:
gbm_clf=GradientBoostingClassifier(random_state=21,learning_rate=0.001,n_estimators=500,
                                   min_impurity_decrease=10e-5,max_depth=15,max_features="sqrt",loss="exponential")
train_test_model(gbm_clf,"GBM")

Training Model : ########################  GBM  ############################### 
Validate Model : ########################  GBM  ############################### 

Training Error : 
0.9851851851851852

Validation Error : 
0.9639487243156

Training Confusion Matrix :
[[393   0]
 [ 12 393]]

Validation Confusion Matrix :
[[175   3]
 [ 10 171]]

Top 10 features from GBM model 
                       features  importance
129                       check   12.833853
37                     subscrib   11.868914
176                url_presence   11.798324
124  traffic_group_unknown_time    7.238259
177                  len_review    5.268125
68                         plea    5.106862
90                      channel    4.056572
63                       youtub    3.628187
128                 check video    2.037276
142                       money    1.888610


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.001, loss='exponential', max_depth=15,
              max_features='sqrt', max_leaf_nodes=None,
              min_impurity_decrease=0.0001, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=500,
              presort='auto', random_state=21, subsample=1.0, verbose=0,
              warm_start=False)

In [56]:
logistic_clf=LogisticRegressionCV(random_state=2018,cv=5,tol=10e-5,max_iter=500)
train_test_model(logistic_clf,"logistic_regresssion")

Training Model : ########################  logistic_regresssion  ############################### 
Validate Model : ########################  logistic_regresssion  ############################### 

Training Error : 
0.9812553011026293

Validation Error : 
0.9609535042522812

Training Confusion Matrix :
[[387   6]
 [  9 396]]

Validation Confusion Matrix :
[[170   8]
 [  6 175]]

Regression Coefficient from logistic_regresssion model 
               features  coefficient
0         check channel     0.765776
1                 thumb     4.857782
2                   tri     0.607753
3                  hate    -1.197718
4                 world     2.395354
5     day_group_weekday    -2.472228
6            make money     1.553114
7                  look     0.172171
8                 peopl    -0.530261
9                   boy    -0.189669
10                watch    -0.698406
11    day_group_weekend    -2.346100
12               shuffl    -1.865070
13                 danc     0.674412
14      

LogisticRegressionCV(Cs=10, class_weight=None, cv=5, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=500,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=2018,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

In [57]:
svc_clf=SVC(random_state=2018,max_iter=-1,C=1.5,kernel="linear",probability=True)
train_test_model(svc_clf,"SVC")

Training Model : ########################  SVC  ############################### 
Validate Model : ########################  SVC  ############################### 

Training Error : 
0.9726133257939874

Validation Error : 
0.9693804705444161

Training Confusion Matrix :
[[387   6]
 [ 16 389]]

Validation Confusion Matrix :
[[173   5]
 [  6 175]]

Regression Coefficient from SVC model 
               features  coefficient
0         check channel     0.000000
1                 thumb     2.038494
2                   tri     0.296287
3                  hate    -0.041916
4                 world     0.327264
5     day_group_weekday    -0.777485
6            make money     0.894954
7                  look    -0.175253
8                 peopl     0.444103
9                   boy     0.000000
10                watch     0.038415
11    day_group_weekend    -0.739097
12               shuffl    -0.525723
13                 danc     0.366875
14                video    -0.298314
15               reall

SVC(C=1.5, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=2018, shrinking=True,
  tol=0.001, verbose=False)

In [76]:
voot_clf = VotingClassifier(voting='soft',weights=[0.9667111552548264,0.969473586194053,0.9667111552548264,
                                                   0.9609535042522812,0.9693804705444161],
                            estimators=[('rf', rf_clf),('ex_clf',ex_clf), ('gbm', gbm_clf),\
                                        ('log_reg', logistic_clf), ('svc', svc_clf)])
train_test_model(voot_clf,"vooting_classifier")

Training Model : ########################  vooting_classifier  ############################### 
Validate Model : ########################  vooting_classifier  ############################### 

Training Error : 
0.9838752238243332

Validation Error : 
0.9749053324228693

Training Confusion Matrix :
[[391   2]
 [ 11 394]]

Validation Confusion Matrix :
[[173   5]
 [  4 177]]



  if diff:
  if diff:
  if diff:


VotingClassifier(estimators=[('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=25, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0001, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            mi...',
  max_iter=-1, probability=True, random_state=2018, shrinking=True,
  tol=0.001, verbose=False))],
         flatten_transform=None, n_jobs=1, voting='soft',
         weights=[0.9667111552548264, 0.969473586194053, 0.9667111552548264, 0.9609535042522812, 0.9693804705444161])