<h2> Optical Charater Recognition </h2>

In [2]:
import cv2 as cv
from PIL import Image
import pytesseract
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import glob 

%matplotlib inline

In [3]:
#function to resize the images to a larger scale for better visibility

def resize_image(image,scale = 1.5):
    width = int(image.shape[0]*scale)
    height =  int(image.shape[1]*scale)
    dimension = (width,height)
    return cv.resize(image,dimension,interpolation = cv.INTER_AREA)

In [4]:
#function to read images from the specified path and recognize text in the same using pytesseract

def text_recognition(path_list):
    text = []
    for path in path_list:
        im = cv.imread(path)
        try:
            im = resize_image(im)
        except:
            continue
        text.append(pytesseract.image_to_string(im))
    return text

In [5]:
#function to clean the retrieved text

def text_preprocessing(text_list):
    empty_list = []
    char_list = ['\n','|','@','~','+','*','/','..','  ','}','(','«','   ','\\','©','—','#']
    for text in text_list:
        for ch in char_list:
            text = text.replace(ch,' ')
        empty_list.append(text)
    return empty_list

In [6]:
#function to set the labels for memes,quotes and return pandas dataframe

def set_labels(text_list,label):
    d = {}
    d['Text'] = text_list
    d['Labels']= [label]* len(text_list)
    df = pd.DataFrame(d)
    return df

In [7]:
#returning the list of image paths

meme_paths = glob.glob('..\Desktop\Memes\*.jpg')
quote_paths = glob.glob('..\Desktop\Quotes\*.jpg')

In [8]:
meme_text = text_recognition(meme_paths)
quote_text = text_recognition(quote_paths)

In [9]:
quote_text[:10]

['Everyone says love hurts, but that is\nnot true, Loneliness hurts. Rejection\nhurts, Losing someone hurts. Envy\nhurts. Everyone gets these things\nconfused with love, but in reality love\nis the only thing in this world that\ncovers up all pain and makes\nsomeone feel wonderful again.\nLove is the only thing in this world\n\nthat does not hurt. #-:—~\n\nLove Wide Open\n\n(heat of hing ert Se\n',
 'OF HAPPINESS.\na Te ee\n',
 'Look for something\npositive in every\nday, even if some days\nyou have to look\na little harder.\n\n',
 '',
 '——\n\nHONEST RELATIONSHIPS\n\nWith-holding\nuncomfortable facts, feelin\n\nanne\ngreatest type of\n\n— known\n',
 'God\n\nis never\n\ntoo busy\n\nto hear\n\nYour prayers.\n',
 "Being happy doesn’t mean\neverything is perfect.\nIt means you've decided\nto look beyond the\nimperfections.\n\n",
 "Goa has a perfect\ntiming; never early,\nnever late. It take\nlittle patience an\n\nfaith, but it's worth\nthe wait...\n",
 'Karma\n\nhink goo joughts, say\n_nic

In [10]:
new_memes = text_preprocessing(meme_text)
new_quotes = text_preprocessing(quote_text)

In [11]:
new_quotes[:10]

['Everyone says love hurts, but that is not true, Loneliness hurts. Rejection hurts, Losing someone hurts. Envy hurts. Everyone gets these things confused with love, but in reality love is the only thing in this world that covers up all pain and makes someone feel wonderful again. Love is the only thing in this world that does not hurt.  -:   Love Wide Open  heat of hing ert Se ',
 'OF HAPPINESS. a Te ee ',
 'Look for something positive in every day, even if some days you have to look a little harder. ',
 '',
 '   HONEST RELATIONSHIPS With-holding uncomfortable facts, feelin anne greatest type of   known ',
 'God is never too busy to hear Your prayers. ',
 "Being happy doesn’t mean everything is perfect. It means you've decided to look beyond the imperfections. ",
 "Goa has a perfect timing; never early, never late. It take little patience an faith, but it's worth the wait . ",
 'Karma hink goo joughts, say _nice things, do good for others. Everything comes back. ',
 '_ The way to” SS 

In [12]:
memes_df = set_labels(new_memes,1)
quotes_df = set_labels(new_quotes,0)

In [13]:
#dropping empty rows from the dataframes

memes_df.drop(memes_df[memes_df['Text']==''].index,inplace=True)
quotes_df.drop(quotes_df[quotes_df['Text']==''].index,inplace=True)

In [14]:
#Merging two dataframes into one and shuffling the rows

text_df = pd.concat([memes_df,quotes_df.iloc[:611,:]],axis=0).sample(frac = 1).reset_index()
text_df.drop(['index'],axis=1,inplace=True)

In [15]:
text_df.head(10)

Unnamed: 0,Text,Labels
0,Every time you get Dressed remember: If you Di...,1
1,"TPve learned that no matter how much I care, s...",0
2,It all begins and ends in your mind. What you ...,0
3,"When you complain, you make yourself a victim....",0
4,OB :. why do you still ive love to everyone? O...,0
5,The problem with the world is that the i intel...,0
6,My silence means am tired of fighting and now...,0
7,"Eso is just a small three letter word, which c...",0
8,"Stop thinking too much, It’s alright not to kn...",0
9,"Don't be impressed by money, followers, degree...",0


In [16]:
text_df.to_csv('..\Desktop\Meme_Identification\meme_quote.csv')