### Authors : Kedarkumar Golla, Krishnan Hariharan

# <center>  SENTIMENT ANALYSIS ON CRICKET COMMENTARY </center>

### OBJECTIVE

Measuring the performance of a cricket player using only statistical analysis does not completely describe how well or how badly he performed ball-by-ball. The objective of this project is to measure the performance of a player (batsman) using Sentiment Analysis on ball-by-ball cricket commentary.


In [None]:
#Importing required libraries

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction import text 
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline 
import nltk
from pandas import DataFrame
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot
from matplotlib.pyplot import figure

### 1. DATA AFTER WEB SCRAPING AND LABELLING

In [None]:
#Reading the web scraped and labelled data

data=pd.read_excel(r"C:\Users\KRISH\Desktop\TXT\All_Matches.xlsx")
# None disables column-width truncation so full commentary text is shown.
# The older -1 sentinel is deprecated and rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)
data.head()

### 2. PRE-PROCESSING THE DATAFRAME

In [None]:
#Creating Class based on the sentiment

# Boolean row masks for the two sentiment labels
is_negative = data["Sentiment"]=="negative"
is_positive = data["Sentiment"]=="positive"

# negative -> 0, positive -> 1; then store the column as integers
data.loc[is_negative,"Class"]=0
data.loc[is_positive,"Class"]=1
data["Class"]=data["Class"].astype("int")
data.head()

In [None]:
# Replace the wicket marker "W" in the Runs column with the placeholder 9;
# the placeholder is swapped for -18 once the column has been parsed (below).
data.loc[data["Runs"]=="W","Runs"]=9

def stringToNumbers(Runs):
    """Convert a raw ``Runs`` cell into an integer.

    Values whose text form is a whole number (``4``, ``"4"``, ``-18``) are
    returned as ``int``. For any other value (for example ``"1lb"``) the
    first digit character found in its text form is returned as ``int``.

    Parameters
    ----------
    Runs : object
        Raw cell value. It is inspected through ``str(Runs)``, so non-string
        values such as floats or ``NaN`` do not raise.

    Returns
    -------
    int or None
        The parsed number, or ``None`` when the value contains no digit.
    """
    text = str(Runs).strip()
    try:
        # Always hand back an int so the column does not end up mixing
        # digit strings ("4") with integers (4).
        return int(text)
    except ValueError:
        pass
    # Mixed values: only the first digit is taken, the rest is ignored.
    return next((int(ch) for ch in text if ch.isdecimal()), None)

# Parse every Runs cell, then replace the wicket placeholder 9 with -18
data["Runs"]=data["Runs"].apply(stringToNumbers)
wicket_rows = data["Runs"]==9
data.loc[wicket_rows,"Runs"]=-18

In [None]:
# Save the pre-processed frame as an Excel workbook (relative path, so it is
# written to the current working directory).
data.to_excel("Full_data_Preprocessed_pandas.xlsx")

In [None]:
# Bar chart of the number of rows per Class value (0 = negative, 1 = positive)
data["Class"].value_counts().plot(kind="bar")

In [None]:
# Preview the first rows after the Class and Runs columns have been prepared
data.head()

### 3. PRE-PROCESSING THE COMMENTARY TEXTS

#### a) Removing Names from the commentaries using POS Tagging

In [None]:
def removing_names(sentence):
    """Remove proper nouns from one commentary line using POS tagging.

    The text is split on whitespace, tagged with ``nltk.tag.pos_tag``, and
    every token tagged ``NNP`` or ``NNPS`` is dropped.

    Parameters
    ----------
    sentence : str
        One commentary entry. Non-string values (``None``/``NaN``) and blank
        strings are accepted and yield an empty string.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # Missing cells reach this function as None/NaN when it is applied over a
    # DataFrame column; they have no .split(), so return early instead of raising.
    if not isinstance(sentence, str):
        return ""
    tokens = sentence.split()
    if not tokens:
        return ""
    proper_noun_tags = ('NNP', 'NNPS')
    kept_words = [word for word, tag in nltk.tag.pos_tag(tokens) if tag not in proper_noun_tags]
    return ' '.join(kept_words)

# New column holding each commentary line with its proper nouns removed
data["edited_Commentry"]=data["Commentary"].apply(removing_names)

#### b) Creating problem specific Stop-words

In [None]:
# Words that must NOT be treated as stop words even though NLTK's English
# list contains them (negations, direction words, quantifiers, ...).
important_list=['do', 'does', 'did', 'doing','above', 'below', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then','once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very','can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]

# Filter rather than call list.remove(): remove() raises ValueError as soon as
# one of the words is missing from the installed NLTK stop-word list.
words_to_keep = set(important_list)
all_stopwords = [word for word in stopwords.words('english') if word not in words_to_keep]


#### c) Count Vectorizer

In [None]:
#Tokenize using Count Vectorizer with Unigrams (n-gram size = 1)

# binary=True records presence/absence only; min_df=0.01 drops terms found in
# fewer than 1 % of the commentary lines.
count_vec_unigram = CountVectorizer(binary=True, stop_words=all_stopwords, ngram_range=(1,1),lowercase=False,min_df=0.01)
# fit_transform learns the vocabulary and builds the document-term matrix in one pass
small_transformed_new = count_vec_unigram.fit_transform(data.edited_Commentry)
# toarray() and get_feature_names_out() are the supported replacements for the
# sparse `.A` shortcut and get_feature_names(), which current SciPy and
# scikit-learn releases no longer provide.
print(DataFrame(small_transformed_new.toarray(), columns=count_vec_unigram.get_feature_names_out()).to_string())

In [None]:
#Tokenize using Count Vectorizer with Bigrams (n-gram size = 2)

# Same settings as the unigram vectorizer, but only two-word sequences are kept
count_vec_bigram = CountVectorizer(binary=True, stop_words=all_stopwords, ngram_range=(2,2),lowercase=False,min_df=0.01)
# fit_transform learns the vocabulary and builds the document-term matrix in one pass
small_transformed_new = count_vec_bigram.fit_transform(data.edited_Commentry)
# toarray() and get_feature_names_out() are the supported replacements for the
# sparse `.A` shortcut and get_feature_names(), which current SciPy and
# scikit-learn releases no longer provide.
print(DataFrame(small_transformed_new.toarray(), columns=count_vec_bigram.get_feature_names_out()).to_string())

In [None]:
#Tokenize using Count Vectorizer with both Unigrams (n-gram size = 1)and Bigrams (n-gram size = 2)

# NOTE: this rebinds the name count_vec_bigram to the combined (1,2)
# vectorizer; the TF-IDF cell below reads the feature names from this object.
count_vec_bigram = CountVectorizer(binary=True, stop_words=all_stopwords, ngram_range=(1,2),lowercase=False,min_df=0.01)
# fit_transform learns the vocabulary and builds the document-term matrix in one pass
small_transformed_new = count_vec_bigram.fit_transform(data.edited_Commentry)
# toarray() and get_feature_names_out() are the supported replacements for the
# sparse `.A` shortcut and get_feature_names(), which current SciPy and
# scikit-learn releases no longer provide.
print(DataFrame(small_transformed_new.toarray(), columns=count_vec_bigram.get_feature_names_out()).to_string())

#### d) Calculating TF-IDF score

In [None]:
# Re-weight the binary unigram+bigram matrix from the previous cell with TF-IDF
tfidf = TfidfTransformer(use_idf=True)
small_tfidfed = tfidf.fit_transform(small_transformed_new)
# count_vec_bigram is the combined (1,2) vectorizer at this point, so its
# feature names line up with the columns of small_transformed_new.
# toarray() and get_feature_names_out() are the supported replacements for the
# sparse `.A` shortcut and get_feature_names(), which current SciPy and
# scikit-learn releases no longer provide.
print(DataFrame(small_tfidfed.toarray(), columns=count_vec_bigram.get_feature_names_out()).to_string())

### 4. MODEL BUILDING FOR CLASSIFICATION

In [None]:
#Logistic Regression Model

# L2-penalised logistic regression; class_weight='balanced' weights the
# classes inversely to their frequency and random_state fixes the seed.
lr = LogisticRegression(
    C=0.8,
    penalty='l2',
    class_weight='balanced',
    random_state=21,
)

In [None]:
#Creating the Pipeline

# Raw text -> binary unigram/bigram counts -> TF-IDF weights -> classifier.
# A different estimator can be swapped in as the final 'clf' step.
pipeline_steps = [
    ('vectorizer', CountVectorizer(binary=True, stop_words=all_stopwords, ngram_range=(1,2),lowercase=False,min_df=0.01)),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', lr),
]
text_classifier = Pipeline(pipeline_steps)

In [None]:
# Preview the frame (including the edited_Commentry column) before splitting
data.head()

In [None]:
#Train-Test-Split

# 35 % of the rows are held out for testing; random_state makes the split repeatable
features = data.edited_Commentry
target = data.Class
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.35, random_state=4)

In [None]:
# Fit the whole pipeline (vectorizer -> TF-IDF -> logistic regression) on the training split
text_classifier.fit(X_train, y_train)

In [None]:
# Show the first four held-out commentary lines and their true labels
print(X_test.head(4))
print(y_test.head(4))

In [None]:
# Renumber both test Series from 0 (drop=True discards the old index) so that
# later cells can address rows by their position.
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Display the first four labels as the cell output
y_test.head(4)

In [None]:
# Predicted labels and per-class probabilities for the held-out commentary
predicted_test = text_classifier.predict(X_test)
predicted_proba_test = text_classifier.predict_proba(X_test)

# Preview at most the first 100 test rows. Iterating positionally avoids an
# out-of-range error on test sets smaller than 100 rows and does not depend
# on X_test / y_test carrying a 0..n-1 index.
preview_size = min(100, len(X_test))
preview_rows = zip(
    X_test.iloc[:preview_size],
    predicted_test[:preview_size],
    predicted_proba_test[:preview_size],
    y_test.iloc[:preview_size],
)
for commentary, predicted_label, class_probabilities, true_label in preview_rows:
    print("{}, {}, {}, {}".format(commentary, predicted_label, class_probabilities, true_label))
    print(class_probabilities)

### 5. MODEL EVALUATION

#### a) F1 SCORE and CONFUSION MATRIX

In [None]:
# ---- Training split: per-class precision/recall/F1 and confusion matrix ----
predicted_train = text_classifier.predict(X_train)
y_train = y_train.astype('category')
train_labels = y_train.cat.categories.tolist()
train_report = metrics.classification_report(y_train, predicted_train, labels=train_labels)
print(train_report)
print(metrics.confusion_matrix(y_train, predicted_train))

# ---- Test split: same report on the held-out rows ----
predicted_test = text_classifier.predict(X_test)
y_test = y_test.astype('category')
test_labels = y_test.cat.categories.tolist()
test_report = metrics.classification_report(y_test, predicted_test, labels=test_labels)
print(test_report)

# Left as the last bare expression so the notebook shows it as the cell output
metrics.confusion_matrix(y_test, predicted_test)

In [None]:
# A notebook cell only displays its last bare expression, so each metric is
# printed explicitly; otherwise the three precision values would be computed
# and silently discarded.
print("Precision (macro)    :", metrics.precision_score(y_test, predicted_test, average='macro'))
print("Precision (micro)    :", metrics.precision_score(y_test, predicted_test, average='micro'))
print("Precision (weighted) :", metrics.precision_score(y_test, predicted_test, average='weighted'))
print("Recall (micro)       :", metrics.recall_score(y_test, predicted_test, average='micro'))

#### b) ROC Curve (No-Skill Prediction Vs Logistic Prediction)

In [None]:
# Generate a No skill prediction: the same constant score (0) for every test row
ns_probs = [0] * len(y_test)

#Predict probabilities
# Column labels are given as an ordered list: a set has no defined order and
# is rejected as `columns` by current pandas releases.
lr_probs = pd.DataFrame(text_classifier.predict_proba(X_test),columns=[0, 1])

In [None]:
#Keep probabilities for the positive outcome only (second column)
lr_probs = lr_probs.iloc[:, 1]

#Calculate scores: area under the ROC curve for the baseline and for the model
ns_auc, lr_auc = (roc_auc_score(y_test, scores) for scores in (ns_probs, lr_probs))

#Summarize scores
print('No Skill: ROC AUC=%.3f' % ns_auc)
print('Logistic: ROC AUC=%.3f' % lr_auc)

#Calculate roc curves (the returned thresholds are not needed)
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)

#Plot the roc curve for the model next to the no-skill baseline
figure(figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')
pyplot.plot(ns_fpr, ns_tpr, label='No Skill', linestyle='--')
pyplot.plot(lr_fpr, lr_tpr, label='Logistic', marker='.')

#Axis labels
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')

#Show the legend, then the plot
pyplot.legend()
pyplot.show()

### 6. PLAYER COMPARISON

### Who is the best opener for Indian team for home matches?

###  KL RAHUL vs ROHIT SHARMA vs SHIKHAR DHAWAN

In [None]:
#Creating the whole pipeline

def _sentiment_score(positive_decisions, negative_decisions):
    """Return (positive / negative) * total balls, rounded to 2 decimals."""
    total_balls = positive_decisions + negative_decisions
    if negative_decisions == 0:
        # The ratio is undefined without negative balls: report an unbounded
        # score when there were positives, and 0.0 for an empty innings.
        return float("inf") if positive_decisions else 0.0
    return round((positive_decisions / negative_decisions) * total_balls, 2)


def preprocess(filename,sheetname):
    """Score one innings from its ball-by-ball commentary.

    Reads one sheet, strips proper nouns from the ``Commentary`` column with
    ``removing_names``, classifies every line with the fitted module-level
    ``text_classifier`` and condenses the predictions into a single score.

    Parameters
    ----------
    filename : str
        Path of the Excel workbook.
    sheetname : str
        Name of the sheet to read; it must contain a ``Commentary`` column.

    Returns
    -------
    float
        ``(positive / negative) * (positive + negative)`` rounded to two
        decimals, where positive/negative are the numbers of lines predicted
        as class 1 / class 0. ``0.0`` when nothing is predicted positive and
        ``inf`` when nothing is predicted negative.
    """
    innings = pd.read_excel(filename, sheet_name=sheetname)
    innings["edited_Commentry"] = innings["Commentary"].apply(removing_names)

    # The pipeline already carries its own stop-word list, so no stop-word
    # handling is needed here.
    predicted_class = text_classifier.predict(innings["edited_Commentry"])
    counts = pd.Series(predicted_class).value_counts()

    # .get(..., 0) tolerates an innings in which one class is never predicted;
    # plain counts[1] / counts[0] would raise KeyError in that case.
    positive_decisions = int(counts.get(1, 0))
    negative_decisions = int(counts.get(0, 0))

    #Create a comparison metric (Sentiment Score) for comparison of the players
    return _sentiment_score(positive_decisions, negative_decisions)

## KL RAHUL

### ANALYSING PERFORMANCE OF KL RAHUL IN LAST 5 INNINGS IN INDIA AS OPENER

In [None]:
# Sentiment score for each of the five KL Rahul sheets, all in the same workbook
rahul_workbook = r"C:\Users\KRISH\Desktop\TXT\RR_ALL.xlsx"
rahul_sheets = ("RR1RA", "RR2RA", "RR3RA", "RR4RA", "RR5RA")
(Rahul_score_RR1,
 Rahul_score_RR2,
 Rahul_score_RR3,
 Rahul_score_RR4,
 Rahul_score_RR5) = [preprocess(rahul_workbook, sheet) for sheet in rahul_sheets]

In [None]:
# Sentiment scores of KL RAHUL in the last 5 innings he played in India as opener

Rahul_scores=[Rahul_score_RR1,Rahul_score_RR2,Rahul_score_RR3,Rahul_score_RR4,Rahul_score_RR5]

# Summary statistics; np.std uses its default (population) standard deviation
Rahul_mean_score=round(np.mean(Rahul_scores),2)
Rahul_median_score=np.median(Rahul_scores)
std_Rahul_score=round(np.std(Rahul_scores),2)

print("KL Rahul Scores in the last 5 innings in India  :\n\n",Rahul_scores)
print("\nMean : ",Rahul_mean_score)
print("Median : ",Rahul_median_score)
print("Standard Deviation : ",std_Rahul_score)
# Rounded to two decimals so the value is reported the same way as for the
# other players and in the inference section.
print("Coefficient of Variation : ",round(std_Rahul_score/Rahul_mean_score,2))

## ROHIT SHARMA

### ANALYSING PERFORMANCE OF ROHIT SHARMA IN LAST 5 INNINGS IN INDIA AS OPENER

In [None]:
# Sentiment score for each of the five Rohit Sharma sheets, spread over two workbooks
rr_workbook = r"C:\Users\KRISH\Desktop\TXT\RR_ALL.xlsx"
rs_workbook = r"C:\Users\KRISH\Desktop\TXT\RS_ALL.xlsx"
rohit_sources = (
    (rr_workbook, "RR1RS"),
    (rs_workbook, "RohitRsd2"),
    (rs_workbook, "RohitRSD1"),
    (rr_workbook, "RR4RS"),
    (rr_workbook, "RR3RS"),
)
(rohit_score_RR1,
 rohit_score_RS2,
 rohit_score_RS1,
 rohit_score_RR4,
 rohit_score_RR3) = [preprocess(workbook, sheet) for workbook, sheet in rohit_sources]

In [None]:
# Sentiment scores of ROHIT SHARMA in the last 5 innings he played in India as opener

Rohit_scores=[rohit_score_RR1,rohit_score_RS2,rohit_score_RS1,rohit_score_RR4,rohit_score_RR3]

# Summary statistics; np.std uses its default (population) standard deviation
Rohit_mean_score=round(np.mean(Rohit_scores),2)
Rohit_median_score=np.median(Rohit_scores)
std_Rohit_score=round(np.std(Rohit_scores),2)
Rohit_cv=round((std_Rohit_score/Rohit_mean_score),2)

print("Rohit Sharma Scores in the last 5 innings in India  :\n\n ",Rohit_scores)
print("\nMean : ",Rohit_mean_score)
print("Median : ",Rohit_median_score)
print("Standard Deviation : ",std_Rohit_score)
print("Coefficient of Variation: ",Rohit_cv)

## SHIKHAR DHAWAN

### ANALYSING PERFORMANCE OF SHIKHAR DHAWAN IN LAST 5 INNINGS IN INDIA AS OPENER

In [None]:
# Sentiment score for each of the five Shikhar Dhawan sheets, all in the same workbook
shikhar_workbook = r"C:\Users\KRISH\Desktop\TXT\RS_ALL.xlsx"
shikhar_sheets = ("ShikarRsd1", "ShikarRSD2", "SikharRSD3", "SIkharRSD4", "ShikarRSD5")
(Shikar_score_RS1,
 Shikar_score_RS2,
 Shikar_score_RS3,
 Shikar_score_RS4,
 Shikar_score_RS5) = [preprocess(shikhar_workbook, sheet) for sheet in shikhar_sheets]

In [None]:
# Sentiment scores of SHIKHAR DHAWAN in the last 5 innings he played in India as opener

Shikar_scores=[Shikar_score_RS1,Shikar_score_RS2,Shikar_score_RS3,Shikar_score_RS4,Shikar_score_RS5]

# Summary statistics; np.std uses its default (population) standard deviation
Shikar_mean_score=round(np.mean(Shikar_scores),2)
Shikar_median_score=np.median(Shikar_scores)
std_Shikar_score=round(np.std(Shikar_scores),2)

print("Shikhar Dhawan Scores in the last 5 innings in India  :\n\n",Shikar_scores)
print("\nMean : ",Shikar_mean_score)
print("Median : ",Shikar_median_score)
print("Standard Deviation : ",std_Shikar_score)
# Rounded to two decimals so the value is reported the same way as for the
# other players and in the inference section.
print("Coefficient of Variation : ",round(std_Shikar_score/Shikar_mean_score,2))

### 7. INFERENCE

### Coefficient of Variation of Sentiment  scores of players :
 
### KL RAHUL			 	         : 		   1.06
### SHIKHAR DHAWAN		    : 		  0.74
### ROHIT SHARMA			   :		  0.63

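### A lower Coefficient of Variation means that a player's sentiment scores vary less from innings to innings, i.e. the player is more consistent. By comparing the Coefficient of Variation of the sentiment scores of the players, we can infer that ROHIT SHARMA is the best (most consistent) opener among the three and that SHIKHAR DHAWAN is a better opener than KL RAHUL for the Indian team to play ODI matches in India.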
