1.Importing Files

In [None]:
# Mount Google Drive into the Colab runtime at /drive so the text files can be read from it
from google.colab import drive
drive.mount('/drive')

In [None]:
# list the files in the Drive folder that holds the two texts
!ls /drive/MyDrive/files_coding

In [None]:
# Read both texts into memory. Each `with` block closes its file handle as
# soon as the content has been read, instead of leaving it open for the rest
# of the kernel session.
# c1 holds the full content of c_1984.txt
with open('/drive/MyDrive/files_coding/c_1984.txt') as f1:
    c1 = f1.read()
# c2 holds the full content of Frankenstein.txt
with open('/drive/MyDrive/files_coding/Frankenstein.txt') as f2:
    c2 = f2.read()

In [None]:
# check content 1: preview only the first 500 characters rather than dumping the whole text
c1[:500]

In [None]:
# check content 2: preview only the first 500 characters rather than dumping the whole text
c2[:500]

2.Tokenize

In [None]:
# import nltk library
import nltk
# download WordNet, the lexical database for the English language (needed by the WordNet lemmatizer)
nltk.download('wordnet')
# import tools for tokenization
from nltk.tokenize import word_tokenize
# download the Punkt tokenizer data used for tokenization
# NOTE(review): newer NLTK releases may additionally require 'punkt_tab' - confirm against the installed version
nltk.download('punkt')

In [None]:
# lowercase each text, then split it into word tokens
tk1, tk2 = (nltk.word_tokenize(text.lower()) for text in (c1, c2))

In [None]:
# check tk1 (the cell previously displayed tk2 by mistake); show only the first 20 tokens
tk1[:20]

In [None]:
# check tk2: show only the first 20 tokens rather than the whole list
tk2[:20]

3.Cleaning Tokens for NLP

In [None]:
import string
# Extended punctuation set: everything in string.punctuation plus extra
# unwanted symbols and fragments that string.punctuation does not cover
ep = set(string.punctuation) | {'--', 's', "’", "'s", '“', '”'}

# show what the extended punctuation set contains, five symbols per row
print("What is included in extended punctuation:")
count = 0
for count, symbol in enumerate(ep, start=1):
    print(symbol, end=" ")
    # break the line after every fifth symbol
    if count % 5 == 0:
        print()

In [None]:
# keep only the tokens that are not part of the extended punctuation set
tk1wp, tk2wp = ([tok for tok in tokens if tok not in ep] for tokens in (tk1, tk2))

In [None]:
# check first tokens without punctuation: show only the first 20 rather than the whole list
tk1wp[:20]

In [None]:
# check second tokens without punctuation: show only the first 20 rather than the whole list
tk2wp[:20]

In [None]:
# import the stopwords corpus reader and download its data
from nltk.corpus import stopwords
nltk.download('stopwords')

In [None]:
# Build the English stop word set once. With stopwords.words('english') inside
# the comprehension condition, the call is evaluated again for every token and
# the result is scanned as a list; a set gives constant-time membership checks.
english_stopwords = set(stopwords.words('english'))
# create lists of tokens that are in neither the stop words nor the extended punctuation
tk1wsw = [w for w in tk1wp if w not in english_stopwords]
tk2wsw = [w for w in tk2wp if w not in english_stopwords]

In [None]:
# check first cleaned tokens: show only the first 20 rather than the whole list
tk1wsw[:20]

In [None]:
# check second cleaned tokens: show only the first 20 rather than the whole list
tk2wsw[:20]

4. Preparing Tokens for Analysis

In [None]:
# import stemming algorithm
from nltk.stem.porter import PorterStemmer
# create one stemmer and reuse it, instead of instantiating a new PorterStemmer for every token
stemmer = PorterStemmer()
# apply stemming to the cleaned tokens of both texts
stk1 = [stemmer.stem(w) for w in tk1wsw]
stk2 = [stemmer.stem(w) for w in tk2wsw]

In [None]:
# check stemmed tokens 1: show only the first 20 rather than the whole list
stk1[:20]

In [None]:
# check stemmed tokens 2: show only the first 20 rather than the whole list
stk2[:20]

In [None]:
# import lemmatization algorithm
from nltk.stem import WordNetLemmatizer
# create one lemmatizer and reuse it, instead of instantiating a new WordNetLemmatizer for every token
lemmatizer = WordNetLemmatizer()
# apply lemmatization to the cleaned tokens, treating each token as a noun ("n")
tkl1 = [lemmatizer.lemmatize(w, "n") for w in tk1wsw]
tkl2 = [lemmatizer.lemmatize(w, "n") for w in tk2wsw]

In [None]:
# check lemmatized tokens 1: show only the first 20 rather than the whole list
tkl1[:20]

In [None]:
# check lemmatized tokens 2: show only the first 20 rather than the whole list
tkl2[:20]

5. Analyze Frequency of words

In [None]:
# import counter from collections
from collections import Counter
# the ten most common stemmed tokens of each text, as (token, count) pairs
cnt1, cnt2 = (Counter(stems).most_common(10) for stems in (stk1, stk2))

In [None]:
# check the ten most common stemmed tokens of text 1 as (token, count) pairs
cnt1

In [None]:
# check the ten most common stemmed tokens of text 2 as (token, count) pairs
cnt2

In [None]:
# Visualize
import matplotlib.pyplot as plt

# split the (token, count) pairs into parallel label / value tuples
labels1, values1 = zip(*cnt1)
labels2, values2 = zip(*cnt2)

# Both charts share one layout, so they are drawn in a single loop instead of
# repeating the plotting code once per book.
charts = [
    (labels1, values1, 'blue', 'Most common stemmed tokens in 1984'),
    (labels2, values2, 'red', 'Most common stemmed tokens in Frankenstein'),
]
for bar_labels, bar_values, bar_color, chart_title in charts:
    plt.figure(figsize=(10, 5))
    plt.bar(bar_labels, bar_values, color=bar_color)
    plt.xlabel('Words')
    plt.ylabel('Frequency')
    plt.title(chart_title)
    # rotation for easier viewing
    plt.xticks(rotation=45)
    # write each count just above its bar
    for i, v in enumerate(bar_values):
        plt.text(i, v + 0.1, str(v), ha='center')
    # show the plot
    plt.show()


Same plot for lemmatized tokens for comparison

In [None]:
# Count the ten most common *lemmatized* tokens (tkl1 / tkl2). The cell previously
# counted the stemmed lists stk1 / stk2, so the "lemmatized" charts repeated the stemmed ones.
cnt1_lemmatized = Counter(tkl1).most_common(10)
cnt2_lemmatized = Counter(tkl2).most_common(10)

In [None]:
# split the (token, count) pairs into parallel label / value tuples
labels1, values1 = zip(*cnt1_lemmatized)
labels2, values2 = zip(*cnt2_lemmatized)

# Both charts share one layout, so they are drawn in a single loop instead of
# repeating the plotting code once per book.
charts = [
    (labels1, values1, 'blue', 'Most common lemmatized tokens in 1984'),
    (labels2, values2, 'red', 'Most common lemmatized tokens in Frankenstein'),
]
for bar_labels, bar_values, bar_color, chart_title in charts:
    plt.figure(figsize=(10, 5))
    plt.bar(bar_labels, bar_values, color=bar_color)
    plt.xlabel('Words')
    plt.ylabel('Frequency')
    plt.title(chart_title)
    # rotate the tick labels for easier viewing
    plt.xticks(rotation=45)
    # write each count just above its bar
    for i, v in enumerate(bar_values):
        plt.text(i, v + 0.1, str(v), ha='center')
    plt.show()

6. Analysis - Similarity Comparison

Version A

In [None]:
# import packages for similarity analysys
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
# the training corpus: one list of cleaned tokens per book
# NOTE(review): the name `books` is rebound to a dict in the sentence-analysis section further down
books = [tk1wsw, tk2wsw]
# model for calculation, trained on both token lists
# NOTE(review): no seed is passed, so the vectors presumably differ between runs - confirm
model = Word2Vec(books, min_count=1)
# create vectors: one row per token occurrence, so repeated tokens yield repeated rows
book1_vector = model.wv[tk1wsw]
book2_vector = model.wv[tk2wsw]

# identify similarities
# pairwise cosine similarity: rows follow the tokens of book 1, columns the tokens of book 2
# NOTE(review): the matrix has len(tk1wsw) x len(tk2wsw) entries, which can be very large - confirm it fits in memory
similarity_scores = cosine_similarity(book1_vector, book2_vector)
print("Similarity_scores:")
print(similarity_scores)
# check overall similarity: the mean over all pairwise scores
overall_similarity = np.mean(similarity_scores)
print("Overall similarity: ", overall_similarity)

Similarity_scores:
[[ 0.20701683  0.08079749 -0.1707272  ...  0.02707134 -0.11814065
  -0.09474735]
 [ 0.02318923 -0.02310109 -0.06204247 ...  0.19780672  0.01102354
  -0.0352779 ]
 [ 0.21562698  0.01395856  0.03951131 ...  0.07064446  0.09430897
   0.10130158]
 ...
 [-0.02813942 -0.02208235 -0.11588963 ... -0.04895997  0.22399652
  -0.14175871]
 [ 0.09384546  0.03260145 -0.00974998 ...  0.026046    0.02924124
   0.1071071 ]
 [ 0.07910256  0.04661493  0.08799782 ... -0.11732103 -0.11337622
   0.01238563]]
Overall similarity:  0.0010516167


In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MinMaxScaler

In [None]:
# perform Singular Value Decomposition, reducing the similarity matrix to 2 components
# random_state pins the randomized solver so the projection is reproducible across runs
svd = TruncatedSVD(n_components=2, random_state=42)
reduced_similarity = svd.fit_transform(similarity_scores)

In [None]:
# scale the reduced similarity scores to the range [-1, 1]
scaler = MinMaxScaler(feature_range=(-1, 1))
reduced_similarity_scaled = scaler.fit_transform(reduced_similarity)
# check the reduced and scaled similarity scores
print("Reduced Similarity Scores (2 dimensions):")
print(reduced_similarity_scaled)


In [None]:
# Split the scaled 2-D points into x (first column) and y (second column) coordinates
x = list(reduced_similarity_scaled[:, 0])
y = list(reduced_similarity_scaled[:, 1])

In [None]:
# Visualize
plt.figure(figsize=(8, 6))
# Each point is one row of the similarity matrix, i.e. one token of 1984.
# The label gives plt.legend() an entry to show; without a labelled artist the
# legend is empty and matplotlib emits a warning.
plt.scatter(x, y, color='blue', label='1984 tokens')
plt.title('Reduced Similarity Scores Visualization')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.legend()
plt.grid(True)
plt.show()


Version B

In [None]:
# import packages from sklearn for similarity analysis
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# this version works on the raw content, before tokenization
texts = [c1, c2]

# preprocess: lowercase every text
preprocessed_text = list(map(str.lower, texts))

# Bag-of-Words: turn each text into a row of word counts
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(preprocessed_text)
# pairwise cosine similarity between the rows of X
cosine_sim = cosine_similarity(X)

In [None]:
# show the pairwise cosine similarity matrix of the two texts
cosine_sim

In [None]:
# Visualize
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import colors

# Light sequential palette built from a HUSL colour
custom_palette = sns.light_palette((210, 90, 60), input="husl")

# Name the rows and columns after the books instead of the default 0 / 1 ticks;
# the order follows texts = [c1, c2], i.e. 1984 first, Frankenstein second
book_names = ['1984', 'Frankenstein']

# Annotated heatmap with grey borders between the cells
plt.figure(figsize=(10, 8))
sns.heatmap(cosine_sim, annot=True, cmap=custom_palette,
            linewidths=1, linecolor='grey', fmt=".2f",
            annot_kws={"size": 14},
            xticklabels=book_names, yticklabels=book_names)

# Light grey background for the axes
plt.gca().patch.set_facecolor('lightgrey')

# Title, placed just above the axes in axes coordinates
plt.text(0.5, 1.05, 'Cosine Similarity between 1984 and Frankenstein', size=20, ha='center', transform=plt.gca().transAxes)

# Display the heatmap
plt.show()



7. Sentiment Analysis

In [None]:
from textblob import TextBlob

In [None]:
# Function to analyse sentiment
def analyze_sentiment(txt):
  """Label `txt` by the sign of its TextBlob polarity.

  Returns "positive" for a polarity above zero, "negative" for a polarity
  below zero and "neutral" otherwise.
  """
  polarity = TextBlob(txt).polarity
  if polarity > 0:
    return "positive"
  if polarity < 0:
    return "negative"
  return "neutral"


In [None]:
# Look at the sentiment of the first 20 lemmatized tokens; printing one line
# for every token in the list would flood the cell output
for text in tkl1[:20]:
  sentiment = analyze_sentiment(text)
  print(f"Text: '{text}' | Sentiment : '{sentiment}'" )

In [None]:
# Initialize counters for sentiment labels; the keys match the labels returned by analyze_sentiment
sentiment_counts = {"positive": 0, "negative": 0, "neutral": 0}


In [None]:
# Analyse sentiment for each lemmatized token of text 1 (1984).
# The counters are reset here so that re-running this cell does not add the
# same tokens a second time.
sentiment_counts = {"positive": 0, "negative": 0, "neutral": 0}
for text in tkl1:
    sentiment1 = analyze_sentiment(text)
    sentiment_counts[sentiment1] += 1

In [None]:
# Visualize the share of each sentiment label as a pie chart
labels = sentiment_counts.keys()
counts = sentiment_counts.values()
# colours are matched to the labels by position; autopct prints each share as a percentage with one decimal
plt.pie(counts, labels=labels, colors=['green', 'red', 'gray'], autopct='%1.1f%%')
plt.title('Sentiment Analysis Results for 1984')
plt.show()


In [None]:
# Analyse sentiment for each lemmatized token of text 2 (Frankenstein).
# Start from fresh counters: without the reset these counts are added on top
# of the counts collected for 1984, so the Frankenstein chart would show
# both books combined.
sentiment_counts = {"positive": 0, "negative": 0, "neutral": 0}
for text in tkl2:
    sentiment2 = analyze_sentiment(text)
    sentiment_counts[sentiment2] += 1

In [None]:
# Visualize the share of each sentiment label as a pie chart
labels = sentiment_counts.keys()
counts = sentiment_counts.values()
# colours are matched to the labels by position; autopct prints each share as a percentage with one decimal
plt.pie(counts, labels=labels, colors=['green', 'red', 'gray'], autopct='%1.1f%%')
plt.title('Sentiment Analysis Results for Frankenstein')
plt.show()


8. Named Entity Recognition Analysis

In [None]:
import spacy
# Load the English language model
nlp = spacy.load("en_core_web_sm")
# Process the full content of c1 with the loaded model
doc = nlp(c1)
# print every recognized entity together with its label
for ent in doc.ents:
  print(ent.text, ent.label_)

In [None]:
# Display the text with its named entities highlighted
from spacy import displacy
# display and HTML are part of the public IPython.display module; importing
# them from IPython.core.display is deprecated
from IPython.display import display, HTML
# jupyter=False makes render() return the markup instead of displaying it
# itself, so `html` is always a string that can be shown explicitly below
html = displacy.render(doc, style ="ent", page=False, jupyter=False)
display(HTML(html))


In [None]:
# Aggregate
import pandas as pd

In [None]:
# Tally how often each named entity label occurs in the processed document
label_counts = {}
for ent in doc.ents:
  label = ent.label_
  if label in label_counts:
    label_counts[label] += 1
  else:
    label_counts[label] = 1
# one (label, count) row per label, in first-seen order
label_rows = list(label_counts.items())
df = pd.DataFrame(label_rows, columns=['Named Entity Label', 'Count'])
print("1984", df)

The same analysis for the second text, c2 (Frankenstein)


In [None]:
# load the English model and process the full content of c2
nlp = spacy.load("en_core_web_sm")
doc = nlp(c2)
# print every recognized entity together with its label
for ent in doc.ents:
  print(ent.text, ent.label_)

In [None]:
# Display the text with its named entities highlighted
from spacy import displacy
# display and HTML are part of the public IPython.display module; importing
# them from IPython.core.display is deprecated
from IPython.display import display, HTML
# jupyter=False makes render() return the markup instead of displaying it
# itself, so `html` is always a string that can be shown explicitly below
html = displacy.render(doc, style ="ent", page=False, jupyter=False)
display(HTML(html))

In [None]:
# Count how often each named entity label occurs in the processed document
label_counts = {}
for ent in doc.ents:
  label = ent.label_
  label_counts[label] = label_counts.get(label, 0) + 1
df = pd.DataFrame(list(label_counts.items()), columns=['Named Entity Label', 'Count'])
# prefix the table with the book name, as the 1984 table does, so the two outputs can be told apart
print("Frankenstein", df)

9. Sentence Analysis

In [None]:
books = {'1984': c1, 'Frankenstein': c2}

# Report sentence statistics for the content of each file
for book_name, text in books.items():
    # Process the text with the loaded model
    doc = nlp(text)

    # length of every sentence in the document, measured with len() on each sentence
    sentence_lengths = [len(sent) for sent in doc.sents]

    # Print statistics
    print(f"\n{book_name}:")
    print(f"Total number of sentences: {len(sentence_lengths)}")
    print(f"Average sentence length: {sum(sentence_lengths) / len(sentence_lengths)}")
    print(f"Maximum sentence length: {max(sentence_lengths)}")
    print(f"Minimum sentence length: {min(sentence_lengths)}")

