In [None]:
import regex as re
import nltk
import pandas as pd
import os
import string
from tqdm import tqdm

In [None]:
base_dir = os.getcwd()
path = os.path.dirname(base_dir)

In [None]:
def extract_words(file_path):
    with open(file_path, 'r') as f:
        content = f.read()
    content = content.lower().splitlines()
    content = [line.split()[0] for line in content if line]
    return content

def extract_sent(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    content = content.lower().replace('\n', ' ')
    content = re.sub(r'https://+\S+', '', content)
    sentences = re.split(r'[.?!:;]\s+', content)
    # content = re.findall(r'\b\w+\b', content)
    return sentences

def text_analysis(sent_list, positive_words, negative_words, stop_words):
    word_list = []
    for sent in sent_list:
        words = re.findall(r'\b\w+\b', sent)
        word_list.extend(words)
    positive_words = list(set(positive_words)-set(stop_words))
    negative_words = list(set(negative_words)-set(stop_words))
    pos = 0
    neg = 0
    for word in word_list:
        if word in positive_words:
            pos += 1
        elif word in negative_words:
            neg += 1
    polarity_score = (pos - neg)/ (pos + neg + 1e-6)
    cleaned_words = [word for word in word_list if word not in stop_words]
    subjectivity_score = (pos + neg) / (len(cleaned_words) + 1e-6)

    def syllable_count(word):
        word = word.lower()
        count = 0
        vowels = "aeiouy"
        if word[0] in vowels:
            count += 1
        for index in range(1, len(word)):
            if word[index] in vowels and word[index - 1] not in vowels:
                count += 1
        if word.endswith("e"):
            count -= 1
        if count == 0:
            count += 1
        return count
    
    complex_words = 0
    for word in word_list:
        if syllable_count(word) >= 3 and word:
            complex_words += 1
    avg_sent_length = len(word_list) / len(sent_list)
    perc_complex_words = complex_words / len(word_list)
    gunning_fog_index = 0.4 * (avg_sent_length + perc_complex_words)
    avg_syllables_per_word = sum(syllable_count(word) for word in word_list) / (len(word_list) + 1e-6)
    avg_word_length = sum(len(word) for word in word_list) / (len(word_list) + 1e-6)
    avg_sent_length = len(word_list) / (len(sent_list) + 1e-6)
    num_personal_pronouns = sum(1 for word in word_list if word in ['i', 'my', 'we', 'us', 'our'])

    return {
        'POSITIVE SCORE': pos,
        'NEGATIVE SCORE': neg,
        'FOG INDEX': gunning_fog_index,
        'POLARITY SCORE': polarity_score,
        'SUBJECTIVITY SCORE': subjectivity_score,
        'SYLLABLE PER WORD': avg_syllables_per_word,
        'AVG SENTENCE LENGTH': avg_sent_length,
        'AVG WORD LENGTH': avg_word_length,
        'PERCENTAGE OF COMPLEX WORDS': perc_complex_words,
        'AVG NUMBER OF WORDS PER SENTENCE': avg_sent_length,
        'PERSONAL PRONOUNS': num_personal_pronouns,
        'COMPLEX WORD COUNT': complex_words,
        'WORD COUNT': len(word_list)
    }

In [7]:
list_stopwords = []
for file in os.listdir(os.path.join(path, '20211030 Test Assignment\\StopWords')):
    list_stopwords.append(extract_words(os.path.join(path, f'20211030 Test Assignment\\StopWords\\{file}')))

stop_words = []
for sublist in list_stopwords:
    stop_words += sublist
stop_words = list(set(stop_words))

In [8]:
positive_words = list(set(extract_words(os.path.join(path, '20211030 Test Assignment\MasterDictionary\\positive-words.txt')))-set(stop_words))
negative_words = list(set(extract_words(os.path.join(path, '20211030 Test Assignment\MasterDictionary\\negative-words.txt')))-set(stop_words))

  positive_words = list(set(extract_words(os.path.join(path, '20211030 Test Assignment\MasterDictionary\\positive-words.txt')))-set(stop_words))
  negative_words = list(set(extract_words(os.path.join(path, '20211030 Test Assignment\MasterDictionary\\negative-words.txt')))-set(stop_words))


In [39]:
output_df = pd.read_excel(os.path.join(path, '20211030 Test Assignment\\Input.xlsx'), sheet_name='Sheet1')

In [None]:
for file in os.listdir(os.path.join(path, 'Aritcle_txt_files')):
    analysis_dict = text_analysis(extract_sent(os.path.join(path, f'Aritcle_txt_files\\{file}')),
                  positive_words,
                  negative_words)
    

Processing Netclan20241017.txt...
Processing Netclan20241018.txt...
Processing Netclan20241019.txt...
Processing Netclan20241020.txt...
Processing Netclan20241021.txt...
Processing Netclan20241022.txt...
Processing Netclan20241023.txt...
Processing Netclan20241024.txt...
Processing Netclan20241025.txt...
Processing Netclan20241026.txt...
Processing Netclan20241027.txt...
Processing Netclan20241028.txt...
Processing Netclan20241029.txt...
Processing Netclan20241030.txt...
Processing Netclan20241031.txt...
Processing Netclan20241032.txt...
Processing Netclan20241033.txt...
Processing Netclan20241034.txt...
Processing Netclan20241035.txt...
Processing Netclan20241036.txt...
Processing Netclan20241037.txt...
Processing Netclan20241038.txt...
Processing Netclan20241039.txt...
Processing Netclan20241040.txt...
Processing Netclan20241041.txt...
Processing Netclan20241042.txt...
Processing Netclan20241043.txt...
Processing Netclan20241044.txt...
Processing Netclan20241045.txt...
Processing Net

In [33]:
# Loop over each row and fill in the analysis results
for i, row in tqdm(output_df.iterrows(), desc="Processing rows... "):
    url_id = row["URL_ID"]
    
    # Load corresponding sentence list for the article (from file or memory)
    try:
        sent_list = extract_sent(os.path.join(path, f'Aritcle_txt_files\\{url_id}.txt'))
    except FileNotFoundError:
        continue  # Skip if file is missing

    # Analyze the text
    analysis = text_analysis(sent_list, positive_words, negative_words, stop_words)

    # Fill in the DataFrame
    for key, value in analysis.items():
        if key in output_df.columns:
            output_df.at[i, key] = value


Processing rows... : 147it [00:13, 10.84it/s]


In [38]:
output_df.to_csv(os.path.join(path, '20211030 Test Assignment\\Output.csv'), index=False)

In [13]:
sent_list = extract_sent(r'C:\Users\nitee\OneDrive\Desktop\VS CODE\BlackCoffer-TestAssginment\Aritcle_txt_files\Netclan20241049.txt')
text_analysis(sent_list, positive_words, negative_words, stop_words)

{'POSITIVE SCORE': 88,
 'NEGATIVE SCORE': 12,
 'FOG INDEX': 5.785023037141514,
 'POLARITY SCORE': 0.7599999924,
 'SUBJECTIVITY SCORE': 0.07570022704337605,
 'SYLLABLE PER WORD': 2.0117536426837077,
 'AVG SENTENCE LENGTH': 14.179999905466667,
 'AVG WORD LENGTH': 6.056887632319282,
 'PERCENTAGE OF COMPLEX WORDS': 0.2825575928537847,
 'AVG NUMBER OF WORDS PER SENTENCE': 14.179999905466667,
 'PERSONAL PRONOUNS': 1,
 'COMPLEX WORD COUNT': 601,
 'WORD COUNT': 2127}

In [44]:
df = pd.read_csv(r'C:\Users\nitee\OneDrive\Desktop\VS CODE\BlackCoffer-TestAssginment\20211030 Test Assignment\Output.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147 entries, 0 to 146
Data columns (total 15 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   URL_ID                            147 non-null    object 
 1   URL                               147 non-null    object 
 2   POSITIVE SCORE                    146 non-null    float64
 3   NEGATIVE SCORE                    146 non-null    float64
 4   POLARITY SCORE                    146 non-null    float64
 5   SUBJECTIVITY SCORE                146 non-null    float64
 6   AVG SENTENCE LENGTH               146 non-null    float64
 7   PERCENTAGE OF COMPLEX WORDS       146 non-null    float64
 8   FOG INDEX                         146 non-null    float64
 9   AVG NUMBER OF WORDS PER SENTENCE  146 non-null    float64
 10  COMPLEX WORD COUNT                146 non-null    float64
 11  WORD COUNT                        146 non-null    float64
 12  SYLLABLE