In [1]:
import numpy as np
import pandas as pd
import os
import cv2
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.preprocessing.image import img_to_array
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import zipfile

zip_file_path = '/content/drive/MyDrive/Colab Notebooks/alphabets_dataset.zip'

extract_folder = '/content/extracted_files'

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_folder)

print('Files extracted to:', extract_folder)

Files extracted to: /content/extracted_files


In [4]:
import shutil
csv_file_path = '/content/drive/MyDrive/Colab Notebooks/target_labels.csv'

destination_path = '/content/target_labels.csv'
shutil.copy(csv_file_path, destination_path)

'/content/target_labels.csv'

In [5]:
images_path = '/content/extracted_files/alphabet_images'
labels_file = '/content/extracted_files/alphabet_labels.csv'

labels_df = pd.read_csv(labels_file)
print(labels_df.head())
print(labels_df.shape)

image_files = [f for f in os.listdir(images_path) if os.path.isfile(os.path.join(images_path, f)) and f.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp'))]
print(f"Number of image files: {len(image_files)}")

image_size = (28, 28)

def load_images_and_labels():
    images = []
    labels = []
    for index, row in labels_df.iterrows():
        filename = row['file']
        label = row['label']
        img_path = os.path.join(images_path, filename)
        if os.path.exists(img_path):
            img = tf.keras.preprocessing.image.load_img(img_path, color_mode='grayscale', target_size=image_size)
            img = tf.keras.preprocessing.image.img_to_array(img) / 255.0
            images.append(img)
            labels.append(label)
        else:
            print(f"Image {img_path} not found.")
    return np.array(images), np.array(labels)

images, labels = load_images_and_labels()

unique_labels = sorted(set(labels))
label_to_index = {label: index for index, label in enumerate(unique_labels)}
indexed_labels = np.array([label_to_index[label] for label in labels])

          file label
0  image_1.png     A
1  image_2.png     A
2  image_3.png     A
3  image_4.png     A
4  image_5.png     A
(372451, 2)
Number of image files: 372451


In [6]:

from sklearn.model_selection import train_test_split
train_images, val_images, train_labels, val_labels = train_test_split(images, indexed_labels, test_size=0.2, random_state=42)

In [10]:
print(f"Total dataset size: {len(images)}")
print(f"Training dataset size: {len(train_images)}")
print(f"Validation dataset size: {len(val_images)}")


Total dataset size: 372451
Training dataset size: 297960
Validation dataset size: 74491


In [13]:
from tensorflow.keras import layers, models

cnn_model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(len(unique_labels), activation='softmax')
])


cnn_model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])


history = cnn_model.fit(train_images, train_labels, epochs=10, validation_data=(val_images, val_labels))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


NameError: name 'accuracy' is not defined

In [14]:

def predict_letter(image, model, image_size=(28, 28)):  # Function to predict letter from a segmented image
    img = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    img = cv2.resize(img, image_size)
    if np.mean(img) == 0.0:
        return ' '
    img = img.reshape((1, image_size[0], image_size[1], 1)) / 255.0
    prediction = model.predict(img)
    predicted_letter = chr(np.argmax(prediction) + 65)
    return predicted_letter

def segment_image(image_path, segment_width=28, segment_height=28):
# Function to segment image into letters using fixed width and height
    image = cv2.imread(image_path)
    height, width = image.shape[:2]
    segments = []

    for y in range(0, height, segment_height):
        for x in range(0, width, segment_width):
            segment = image[y:y + segment_height, x:x + segment_width]
            if segment.shape[0] == segment_height and segment.shape[1] == segment_width:
                segments.append(segment)

    return segments


def predict_sentence(image_path, model):  # Function to predict sentence from an image using segment_image and predict_letter
    segments = segment_image(image_path)
    sentence = ''.join([predict_letter(segment, model) for segment in segments])
    return sentence

In [16]:
import os
images_path = '/content/drive/MyDrive/Colab Notebooks/target_images'
os.listdir(images_path)

import shutil
local_images_path = '/content/images_folder'
shutil.copytree(images_path, local_images_path)
os.listdir(local_images_path)

['line_5.png',
 'line_2.png',
 'line_1.png',
 'line_4.png',
 'line_3.png',
 'line_6.png']

In [17]:

target_images_folder = 'target_images'

target_labels = pd.read_csv('target_labels.csv')

true_labels = target_labels['sentiment'].tolist()


In [31]:

def predict_sentiment(text, model):   # Function to predict sentiment for extracted texts
    return model.predict([text])[0]

In [32]:

ocr_predictions = []
for i in range(1, 7):  # 6 image files from line_1.png to line_6.png
    image_file = f'/content/images_folder/line_{i}.png'
    image_path = os.path.join(target_images_folder, image_file)

    print(f"Processing image: {image_file}")

    recognized_text = predict_sentence(image_path, cnn_model)
    ocr_predictions.append(recognized_text)

    print(f"Recognized sentence: {recognized_text}")


ocr_accuracy = accuracy_score(true_labels, ocr_predictions)
print(f'Final OCR Accuracy: {ocr_accuracy:.2f}')


Processing image: /content/images_folder/line_1.png
Recognized sentence: I AM REALLY ANNOYED BY YOUR CONSTANT COMPLAINING AND YOU NEVER OFFER ANY SOLUTIONS WHICH IS VERY UNHELPFUL AND NEGATIVE 
Processing image: /content/images_folder/line_2.png
Recognized sentence: IT IS FRUSTRATENG THAT YOU NEVER PAY ATTENTION DURING DISCUSSIONS AND YOUR LACK OF FOCUS IS REALLY AFFECTING OUR PROGRESS         
Processing image: /content/images_folder/line_3.png
Recognized sentence: E AM DELIGHTED BY YOUR FRIENDLINESS AND YOU ALWAYS MAKE EVERYONE FEEL WELCOME WHICH FOSTERS A SENSE OF COMMUNITY        
Processing image: /content/images_folder/line_4.png
Recognized sentence: IT IS WONDERFUL THAT YOU ALWAYS SHOW KINDNESS ANO YOUR EMPATHY TOWARDS OTHERS IS TRULY HEARTWARMINT AND APPRECIATED     
Processing image: /content/images_folder/line_5.png
Recognized sentence: YOUR ANALYSIS OF THE DATA WAS ACCURATE AND WELL PRESENTED PROVIDING A CLEAR UNDERSTANDING OF THE TRENDS AND PATTERNS    
Processing image: /c

In [25]:
import shutil
csv_file_path = '/content/drive/MyDrive/Colab Notebooks/sentiment_analysis_dataset.csv'

destination_path = '/content/sentiment_analysis_dataset.csv'
shutil.copy(csv_file_path, destination_path)

'/content/sentiment_analysis_dataset.csv'

In [33]:

sentiment_data = pd.read_csv('sentiment_analysis_dataset.csv')


X = sentiment_data['line']
Y = sentiment_data['sentiment']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

pipeline = make_pipeline(TfidfVectorizer(), LogisticRegression())
# Train a sentiment analysis model (TF-IDF + Logistic Regression)
pipeline.fit(X_train, Y_train)

Y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
print(f"Sentiment Analysis Model Accuracy: {accuracy * 100:.2f}%")

Sentiment Analysis Model Accuracy: 66.67%


In [30]:



extracted_texts = ocr_predictions
predicted_sentiments = [predict_sentiment(text, pipeline) for text in extracted_texts]

for i in range(1, 7):  # Assuming 6 image files from line_1.png to line_6.png

    print(f"Predicted sentiment: {predicted_sentiments}")

sentiment_accuracy = accuracy_score(true_labels, predicted_sentiments)
# Evaluate sentiment analysis accuracy on OCR predictions
print(f"Sentiment Analysis Accuracy on OCR Predictions: {sentiment_accuracy * 100:.2f}%")


Predicted sentiment: ['Happy', 'Angry', 'Happy', 'Happy', 'Neutral', 'Neutral']
Predicted sentiment: ['Happy', 'Angry', 'Happy', 'Happy', 'Neutral', 'Neutral']
Predicted sentiment: ['Happy', 'Angry', 'Happy', 'Happy', 'Neutral', 'Neutral']
Predicted sentiment: ['Happy', 'Angry', 'Happy', 'Happy', 'Neutral', 'Neutral']
Predicted sentiment: ['Happy', 'Angry', 'Happy', 'Happy', 'Neutral', 'Neutral']
Predicted sentiment: ['Happy', 'Angry', 'Happy', 'Happy', 'Neutral', 'Neutral']
Sentiment Analysis Accuracy on OCR Predictions: 83.33%
