# Using OCR to Extract Text from Images:

In [None]:
#!pip install numpy opencv-contrib-python
#!pip install pytesseract        #you will need to download tesseract.exe for OCR
#!pip install scikit-learn scikit-image
#!pip install pydirectory
from PIL import Image
from pytesseract import pytesseract
  
# Defining paths to tesseract.exe
# and the image we would be using
path_to_tesseract = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
image_path = r"C:\Users\afaqa\Desktop\Intro to DS\Project\images\image_1.jpg"
  
# Opening the image & storing it in an image object
img = Image.open(image_path)
  
# Providing the tesseract executable
# location to pytesseract library
pytesseract.tesseract_cmd = path_to_tesseract
  
# Passing the image object to image_to_string() function
# This function will extract the text from the image
text = pytesseract.image_to_string(img)
  
# Displaying the extracted text
#print(text)

# Loading Data and Organizing it:

In [145]:
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn import preprocessing
import pickle

df = pd.read_csv("labels.csv")
df = df.iloc[:,1:]     #to drop first column
df.dropna(inplace=True)
df = df.drop(['text_ocr', 'image_name'], axis=1)
df = df.reset_index()
df = df.iloc[:,1:]
df["text_corrected"] = df["text_corrected"].str.lower() 
df["text_corrected"] = df["text_corrected"].str.replace('[^A-Za-z0-9]+',' ', regex = True)

# Applying Feature Extraction on Text: 

In [146]:
from nltk.stem.porter import *
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

def pStem(sentences): 
    corpus = []

    for sentence in sentences:
        sentence = sentence.split()
        ps = PorterStemmer()
        sentence = [ps.stem(word) for word in sentence if not word in set(stopwords.words('english'))]
        sentence = ' '.join(sentence)
        
        corpus.append(sentence)
        
    cv = CountVectorizer(stop_words = 'english')
    
    x = cv.fit_transform(corpus)
    return x

new_df = pStem(df['text_corrected'])

<h3>Labelling Sentimental Values:</h3>

In [147]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
df['overall_sentiment'] = le.fit_transform(df['overall_sentiment'])

# Splitting Data to Apply Models:

In [148]:
from sklearn.model_selection import train_test_split
y = df.iloc[:,1]

x_train, x_test, y_train, y_test = train_test_split(new_df, y, stratify = y, test_size = 0.20, random_state = 5)

# Applying different models to text:

<h2>Using Multinomial Naiyes Bayes:</h2>

In [149]:
from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB()
classifier.fit(x_train, y_train)

#saving model to disk
filename = 'text_mNB.sav'
pickle.dump(classifier, open(filename, 'wb'))

y_pred = classifier.predict(x_test)

pred_results = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})

mNB_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)

mNB_crosstab

y_test,0,1,2,3,4
y_pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,3,3,4,0,0
1,25,128,167,7,55
2,59,281,418,20,134
3,0,1,0,0,0
4,7,18,23,2,11


In [150]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, y_pred, normalize=True)

from sklearn.metrics import f1_score
f1 = f1_score(y_test, y_pred, average='weighted')

from sklearn.metrics import precision_score
prec = precision_score(y_test, y_pred, average='weighted')

from sklearn.metrics import recall_score
rec = recall_score(y_test, y_pred, average='weighted')

text_mnB_scores = pd.DataFrame({"Accuracy":acc * 100, "Precision":prec * 100, "Recall": rec * 100, "F1":f1 * 100}, index = ["Scores (%)"])
text_mnB_scores

Unnamed: 0,Accuracy,Precision,Recall,F1
Scores (%),40.995608,35.811451,40.995608,36.142935


<h2>Using Logistic Regression: </h2>

In [151]:
from sklearn.linear_model import LogisticRegression

logistic_model = LogisticRegression(penalty='l2', C=1.0, solver='liblinear').fit(x_train, y_train)

#saving model to disk
filename = 'text_lg.sav'
pickle.dump(logistic_model, open(filename, 'wb'))

y_pred = logistic_model.predict(x_test)

pred_results = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})

logReg_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)

logReg_crosstab

y_test,0,1,2,3,4
y_pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,2,1,0,3
1,30,131,191,9,63
2,60,284,395,17,122
3,0,0,0,0,1
4,4,14,25,3,11


In [152]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, y_pred, normalize=True)

from sklearn.metrics import f1_score
f1 = f1_score(y_test, y_pred, average='weighted')

from sklearn.metrics import precision_score
prec = precision_score(y_test, y_pred, average='weighted')

from sklearn.metrics import recall_score
rec = recall_score(y_test, y_pred, average='weighted')

text_lg_scores = pd.DataFrame({"Accuracy":acc * 100, "Precision":prec * 100, "Recall": rec * 100, "F1":f1 * 100}, index = ["Scores (%)"])
text_lg_scores

Unnamed: 0,Accuracy,Precision,Recall,F1
Scores (%),39.311859,32.729833,39.311859,34.676161


<h2>Using KNN:</h2>

In [153]:
from sklearn.neighbors import KNeighborsRegressor

#new_df = pStem(df['text_corrected'])

#y = df.iloc[:,1]
#x_train, x_test, y_train, y_test = train_test_split(new_df, y, stratify = y, test_size = 0.20, random_state = 5)

KNN_model = KNeighborsRegressor(n_neighbors=3).fit(x_train,y_train)

#saving model to disk
filename = 'text_knn.sav'
pickle.dump(KNN_model, open(filename, 'wb'))

y_pred = KNN_model.predict(x_test) #Predictions on Testing data

y_pred = [round(num) for num in y_pred]

pred_results = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})

knn_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)

knn_crosstab

y_test,0,1,2,3,4
y_pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2,14,12,1,6
1,45,159,219,10,78
2,42,226,350,18,105
3,5,31,29,0,11
4,0,1,2,0,0


In [154]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, y_pred, normalize=True)

from sklearn.metrics import f1_score
f1 = f1_score(y_test, y_pred, average='weighted')

from sklearn.metrics import precision_score
prec = precision_score(y_test, y_pred, average='weighted')

from sklearn.metrics import recall_score
rec = recall_score(y_test, y_pred, average='weighted')

text_knn_scores = pd.DataFrame({"Accuracy":acc * 100, "Precision":prec * 100, "Recall": rec * 100, "F1":f1 * 100}, index = ["Scores (%)"])
text_knn_scores

Unnamed: 0,Accuracy,Precision,Recall,F1
Scores (%),37.408492,31.372466,37.408492,34.044016


# Reading Images and Applying Feature Extraction:

In [155]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
import os
import pandas as pd
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import cv2

os.chdir(r'C:\Users\afaqa\Desktop\Intro to DS\Project')
imageDf = pd.read_csv('labels.csv')
imageDf = imageDf.iloc[:,1:]
imageDf.dropna(inplace=True)
imageDf = imageDf.drop(['text_ocr', 'text_corrected'], axis=1)

imageDf = imageDf.reset_index()

names = imageDf['image_name']
new_image_df = pd.DataFrame(columns = range(0, 900))
i = 0
skipped = []
os.chdir(r'C:\Users\afaqa\Desktop\Intro to DS\Project\images')
for name in names:
    try:
        image = cv2.imread(name, cv2.IMREAD_GRAYSCALE )
        #plt.imshow(image)
        size=(30, 30)
        resized_image_feature_vector = cv2.resize(image, size)
        #plt.imshow(resized_image_feature_vector)
        resized_flattened_image_feature_vector = resized_image_feature_vector.flatten()

        new_image_df.loc[i] = resized_flattened_image_feature_vector
        i += 1
    except:
        skipped.append(i)
        continue
    
os.chdir(r'C:\Users\afaqa\Desktop\Intro to DS\Project')

<h3>Labelling Sentimental Values for KNN:</h3>

In [156]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
imageDf['overall_sentiment'] = le.fit_transform(imageDf['overall_sentiment'])

# Applying Different Models to Image:

# Splitting Data to Apply Models:

In [157]:
from sklearn.model_selection import train_test_split

X = new_image_df

Y = imageDf["overall_sentiment"]

for x in skipped:
    Y = Y.drop(x)

x_train, x_test, y_train, y_test = train_test_split(new_image_df, Y, stratify = Y, test_size = 0.20, random_state = 5)

<h2>Using Multinomial Naiyes Bayes:</h2>

In [158]:
from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB()
classifier.fit(x_train, y_train)

#saving model to disk
filename = 'image_mNB.sav'
pickle.dump(classifier, open(filename, 'wb'))

y_pred = classifier.predict(x_test)

pred_results = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})

mNB_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)

mNB_crosstab

y_test,0,1,2,3,4
y_pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,21,97,149,7,53
1,16,80,142,8,44
2,17,59,91,5,21
3,24,77,102,3,34
4,16,119,127,6,48


In [159]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, y_pred, normalize=True)

from sklearn.metrics import f1_score
f1 = f1_score(y_test, y_pred, average='weighted')

from sklearn.metrics import precision_score
prec = precision_score(y_test, y_pred, average='weighted')

from sklearn.metrics import recall_score
rec = recall_score(y_test, y_pred, average='weighted')

image_mnB_scores = pd.DataFrame({"Accuracy":acc * 100, "Precision":prec * 100, "Recall": rec * 100, "F1":f1 * 100}, index = ["Scores (%)"])
image_mnB_scores

Unnamed: 0,Accuracy,Precision,Recall,F1
Scores (%),17.789165,32.506548,17.789165,20.591423


<h2>Using Logistic Regression:</h2>

In [160]:
from sklearn.linear_model import LogisticRegression

logistic_model = LogisticRegression(penalty='l2', C=1.0, solver='liblinear', max_iter = 200).fit(x_train, y_train)

#saving model to disk
filename = 'image_lg.sav'
pickle.dump(logistic_model, open(filename, 'wb'))

y_pred = logistic_model.predict(x_test)

pred_results = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})

logReg_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)

logReg_crosstab

y_test,0,1,2,3,4
y_pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,6,31,34,1,15
1,24,114,170,6,63
2,48,231,317,19,96
3,9,20,34,1,10
4,7,36,56,2,16


In [161]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, y_pred, normalize=True)

from sklearn.metrics import f1_score
f1 = f1_score(y_test, y_pred, average='weighted')

from sklearn.metrics import precision_score
prec = precision_score(y_test, y_pred, average='weighted')

from sklearn.metrics import recall_score
rec = recall_score(y_test, y_pred, average='weighted')

image_lg_scores = pd.DataFrame({"Accuracy":acc * 100, "Precision":prec * 100, "Recall": rec * 100, "F1":f1 * 100}, index = ["Scores (%)"])
image_lg_scores

Unnamed: 0,Accuracy,Precision,Recall,F1
Scores (%),33.235725,32.011075,33.235725,32.339379


<h2>Using KNN:</h2>

In [162]:
from sklearn.neighbors import KNeighborsRegressor

#new_df = pStem(df['text_corrected'])

#y = df.iloc[:,1]
#x_train, x_test, y_train, y_test = train_test_split(new_df, y, stratify = y, test_size = 0.20, random_state = 5)

KNN_model = KNeighborsRegressor(n_neighbors=3).fit(x_train,y_train)

#saving model to disk
filename = 'image_knn.sav'
pickle.dump(KNN_model, open(filename, 'wb'))

y_pred = KNN_model.predict(x_test) #Predictions on Testing data

y_pred = [round(num) for num in y_pred]

pred_results = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})

knn_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)

knn_crosstab

y_test,0,1,2,3,4
y_pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,3,3,0,0
1,29,137,154,12,63
2,50,222,343,14,102
3,15,65,107,3,34
4,0,5,4,0,1


In [163]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, y_pred, normalize=True)

from sklearn.metrics import f1_score
f1 = f1_score(y_test, y_pred, average='weighted')

from sklearn.metrics import precision_score
prec = precision_score(y_test, y_pred, average='weighted')

from sklearn.metrics import recall_score
rec = recall_score(y_test, y_pred, average='weighted')

image_knn_scores = pd.DataFrame({"Accuracy":acc * 100, "Precision":prec * 100, "Recall": rec * 100, "F1":f1 * 100}, index = ["Scores (%)"])
image_knn_scores

Unnamed: 0,Accuracy,Precision,Recall,F1
Scores (%),35.431918,33.449112,35.431918,33.532302


In [164]:
total_f1_score = image_knn_scores['F1'] + text_knn_scores['F1']
total_f1_score = total_f1_score + image_lg_scores['F1'] + text_lg_scores['F1']
total_f1_score = total_f1_score + image_mnB_scores['F1'] + text_mnB_scores['F1']
print("Total F1 Score: ", total_f1_score[0])

Total F1 Score:  191.32621517789892


In [None]:
from sklearn import model_selection
import pickle
import pandas as pd

# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, Y_test)
print(result)