## Text analysis

In [1]:
# importing necessary libraries
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.corpus import cmudict
from nltk.tokenize import sent_tokenize, word_tokenize
import os
import gensim
from gensim.utils import simple_preprocess
import gensim.corpora as corpora

In [2]:
# loading articles
folder = 'C:\\Users\\rohit\\Python\\drive-download-20230909T190744Z-001\\extracted_articles'

filenames = []
contents = []

for filename in os.listdir(folder):
    if filename.endswith('.txt'):
        file_path = os.path.join(folder, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            file_content = file.read()
        filenames.append(filename.strip('.txt'))
        contents.append(file_content)

df = pd.DataFrame({'article_id': filenames, 'article': contents})
df.head()

Unnamed: 0,article_id,article
0,10282.6,“machine intelligence is the last invention th...
1,10744.4,introduction where is this disruptive techn...
2,11206.2,in future or in upcoming years humans and mach...
3,12129.8,machine learning techniques may have been used...
4,123.0,"telemedicine, the use of technology to diagnos..."


#### Tokenization

In [3]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc = True))

#### Stopwords removal

In [4]:
stop_words = stopwords.words('english')

def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

In [5]:
data = df.article.values.tolist()

data_words = remove_stopwords(data)

#### Positive score

In [6]:
# getting positive words
pos = pd.read_csv(r'C:\Users\rohit\Python\drive-download-20230909T190744Z-001\MasterDictionary\positive-words.txt', header = None)
pos.columns = ['words']

In [7]:
# calculating scores
pos_score = []
for words in data_words:
    score = 0
    for word in words:
        if word in list(pos.words):
            score += 1
    pos_score.append(score)

In [8]:
# adding positive score as a new column in dataframe
df['positive_score'] = pos_score

#### Negative score

In [9]:
# geting negative words
neg = pd.read_csv(r'C:\Users\rohit\Python\drive-download-20230909T190744Z-001\MasterDictionary\negative-words.txt', header = None, encoding='latin-1')
neg.columns = ['words']

In [10]:
# calculating scores
neg_score = []
for words in data_words:
    score = 0
    for word in words:
        if word in list(neg.words):
            score += 1
    neg_score.append(score)

In [11]:
# adding negaitve score as a new column in dataframe
df['negative_score'] = neg_score

#### Polarity score

In [12]:
df['polarity_score'] = (df['positive_score'] - df['negative_score'])/((df['positive_score'] + df['negative_score']) + 0.000001)

#### Subjectivity score

In [13]:
total_words = [len(words) for words in data_words]
df['subjectivity_score'] = (df['positive_score'] + df['negative_score'])/total_words

#### Average sentence length

In [14]:
df['average_sentence_length'] = [total/len(data_words) for total in total_words]

#### Complex word percentage

In [15]:
# assuming that a complex word is a word which is more than 10 characters long
complex_words_list = [[word for word in words if len(word) > 10] for words in data_words]

# Calculate the percentage of complex words
complex_words_percentages = [(len(complex_words)/sum(total_words))*100 for complex_words in complex_words_list]
df['complex_words_percentages'] = complex_words_percentages

#### Fog index

In [16]:
fog_sum = df['average_sentence_length'] + df['complex_words_percentages']
df['fog_index'] = 0.4 * fog_sum

#### Average number of words per sentence

In [17]:
# average number is words per sentence is same as average_sentence_length which we have already calculated

#### Complex words count

In [18]:
complex_words_count = df['complex_words_percentages'] * total_words
df['complex_words_count'] = complex_words_count

#### Word count

In [19]:
df['word_count'] = total_words

#### Syllable count per word

In [20]:
syllables = []
for words in data_words:
    vowels = 0
    for word in words:
        vowel_count = len([char for char in word if char in 'aeiouy'])
        vowels += vowel_count
    avg_vowel = vowels/len(words)
    syllables.append(avg_vowel)

df['syllabel_per_word'] = syllables

#### Personal pronouns

In [21]:
pronouns = ['i', 'we', 'my', 'ours', 'us']
pronouns_count = [len([word for word in words if word in pronouns]) for words in data_words]
df['personal_pronouns'] = pronouns_count

#### Average word length

In [22]:
avg_len = [sum(len(word) for word in words) / len(words) for words in data_words]
df['average_word_length'] = avg_len

In [23]:
df.head()

Unnamed: 0,article_id,article,positive_score,negative_score,polarity_score,subjectivity_score,average_sentence_length,complex_words_percentages,fog_index,complex_words_count,word_count,syllabel_per_word,personal_pronouns,average_word_length
0,10282.6,“machine intelligence is the last invention th...,71,29,0.42,0.105374,9.303922,0.107839,3.764704,102.338934,949,2.696523,6,6.808219
1,10744.4,introduction where is this disruptive techn...,58,26,0.380952,0.123348,6.676471,0.072864,2.699734,49.620379,681,2.657856,3,6.722467
2,11206.2,in future or in upcoming years humans and mach...,28,12,0.4,0.099502,3.941176,0.03206,1.589295,12.888183,402,2.671642,2,6.718905
3,12129.8,machine learning techniques may have been used...,44,14,0.517241,0.147583,3.852941,0.017487,1.548171,6.872532,393,2.442748,0,6.3257
4,123.0,"telemedicine, the use of technology to diagnos...",89,24,0.575221,0.113682,9.745098,0.189446,3.973818,188.309701,994,3.056338,0,7.338028


In [24]:
df.to_csv('output.csv', index = False)