## Extracting the data

### We can use Beautiful Soup to extract the required data from a given URL


In [1]:
import os
import requests
from bs4 import BeautifulSoup

def extract_article(url, url_id, folder_name="articles", timeout=30):
    """Download a page and save its title plus paragraph text to a text file.

    The output file is ``<folder_name>/<url_id>.txt`` and contains the page
    title, a blank line, then the text of every ``<p>`` element (one per line).

    Args:
        url: Address of the page to download.
        url_id: Identifier used as the output file name (without extension).
        folder_name: Directory that receives the file; created when missing.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled host.

    Returns:
        None. Success and failure are both reported with ``print``; any
        exception (network, parsing, file system) is caught and printed so a
        batch loop over many URLs keeps going.
    """
    try:
        os.makedirs(folder_name, exist_ok=True)
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to fetch URL: {url}, Status Code: {response.status_code}")
            return
        soup = BeautifulSoup(response.text, 'html.parser')
        # `.string` is None for an empty <title> or one with nested tags;
        # fall back instead of writing the literal text "None".
        title = soup.title.string if soup.title and soup.title.string else "Untitled"
        paragraphs = soup.find_all("p")
        article_text = "\n".join(p.get_text(strip=True) for p in paragraphs)
        content = f"{title}\n\n{article_text}"
        file_name = os.path.join(folder_name, f"{url_id}.txt")
        with open(file_name, 'w', encoding='utf-8') as file:
            file.write(content)
        print(f"Article saved to {file_name}")
    except Exception as e:
        # Deliberate best effort: report and continue with the next article.
        print(f"An error occurred: {e}")




In [21]:
import pandas as pd
# CSV export of the input sheet; later cells read its URL_ID and URL columns.
file_path = "Input.xlsx - Sheet1.csv"
df = pd.read_csv(file_path)
# Bare last expression so the notebook renders the loaded frame.
df

Unnamed: 0,URL_ID,URL
0,Netclan20241017,https://insights.blackcoffer.com/ai-and-ml-bas...
1,Netclan20241018,https://insights.blackcoffer.com/enhancing-fro...
2,Netclan20241019,https://insights.blackcoffer.com/roas-dashboar...
3,Netclan20241020,https://insights.blackcoffer.com/efficient-pro...
4,Netclan20241021,https://insights.blackcoffer.com/development-o...
...,...,...
142,Netclan20241159,https://insights.blackcoffer.com/population-an...
143,Netclan20241160,https://insights.blackcoffer.com/google-lsa-ap...
144,Netclan20241161,https://insights.blackcoffer.com/healthcare-da...
145,Netclan20241162,https://insights.blackcoffer.com/budget-sales-...


In [24]:
import pandas as pd

# Download every article listed in the input sheet, printing a running
# counter (row label + 1) after each attempt.
for row_label, article_id, article_url in zip(df.index, df['URL_ID'], df['URL']):
    extract_article(article_url, article_id)
    print(row_label + 1)


Article saved to articles\Netclan20241017.txt
1
Article saved to articles\Netclan20241018.txt
2
Article saved to articles\Netclan20241019.txt
3
Article saved to articles\Netclan20241020.txt
4
Article saved to articles\Netclan20241021.txt
5
Article saved to articles\Netclan20241022.txt
6
Article saved to articles\Netclan20241023.txt
7
Article saved to articles\Netclan20241024.txt
8
Article saved to articles\Netclan20241025.txt
9
Article saved to articles\Netclan20241026.txt
10
Article saved to articles\Netclan20241027.txt
11
Article saved to articles\Netclan20241028.txt
12
Article saved to articles\Netclan20241029.txt
13
Article saved to articles\Netclan20241030.txt
14
Article saved to articles\Netclan20241031.txt
15
Article saved to articles\Netclan20241032.txt
16
Article saved to articles\Netclan20241033.txt
17
Article saved to articles\Netclan20241034.txt
18
Article saved to articles\Netclan20241035.txt
19
Article saved to articles\Netclan20241036.txt
20
Article saved to articles\Net

## Token Generation and Filtering the Tokens

### To divide the text data into tokens we can use the nltk package

In [2]:
pip install nltk


Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.11.6-cp39-cp39-win_amd64.whl.metadata (41 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------ --------------------------------- 0.3/1.5 MB ? eta -:--:--
   ------------- -------------------------- 0.5/1.5 MB 1.4 MB/s eta 0:00:01
   --------------------------- ------------ 1.0/1.5 MB 1.9 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 2.0 MB/s eta 0:00:00
Downloading regex-2024.11.6-cp39-cp39-win_amd64.whl (274 kB)
Installing collected packages: regex, nltk
Successfully installed nltk-3.9.1 regex-2024.11.6
Note: you may need to restart the kernel to use updated packages.


In [3]:
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import pickle
from collections import Counter

In [4]:
# Regex tokenizer that keeps runs of word characters (\w+).
# NOTE(review): tokenize() further down builds its own tokenizer, so this
# module-level instance looks unused in the visible cells — confirm.
tokenizer = nltk.RegexpTokenizer(r'\w+')

### Tokenizing and filtering out stop words

In [None]:
import os

def load_stop_words(folder_path):
    """Collect the stop words stored in every ``.txt`` file of a folder.

    Each non-blank line of each file is stripped, lower-cased and added to
    one shared set, so duplicates across files collapse. Progress and
    per-file counts are printed; a file that cannot be read is reported and
    skipped.

    Args:
        folder_path: Directory holding the stop-word ``.txt`` files.

    Returns:
        set[str]: All distinct lower-cased stop words found.
    """
    collected = set()
    for entry in os.listdir(folder_path):
        full_path = os.path.join(folder_path, entry)
        # Ignore sub-directories and anything that is not a .txt file.
        if not (os.path.isfile(full_path) and entry.endswith('.txt')):
            continue
        print(f"Reading file: {entry}")
        try:
            with open(full_path, 'r', encoding='utf-8', errors='ignore') as handle:
                cleaned = []
                for raw_line in handle.read().splitlines():
                    stripped = raw_line.strip()
                    if stripped:
                        cleaned.append(stripped.lower())
                print(f"Loaded {len(cleaned)} words from {entry}")
                collected.update(cleaned)
        except Exception as e:
            print(f"Error reading file {entry}: {e}")
    print(f"Total stop words loaded: {len(collected)}")
    return collected

def filter_stop_words(text, stop_words):
    """Remove stop words from whitespace-separated text.

    A token is dropped when its lower-cased form is in ``stop_words``;
    surviving tokens keep their original case and are re-joined with single
    spaces.
    """
    kept = []
    for token in text.split():
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

# Example usage: build the stop-word set once so later cells can reuse it.
stop_words_folder = "StopWords"  # Replace with the correct folder path
stop_words = load_stop_words(stop_words_folder)

Reading file: StopWords_Auditor.txt
Loaded 8 words from StopWords_Auditor.txt
Reading file: StopWords_Currencies.txt
Loaded 85 words from StopWords_Currencies.txt
Reading file: StopWords_DatesandNumbers.txt
Loaded 109 words from StopWords_DatesandNumbers.txt
Reading file: StopWords_Generic.txt
Loaded 121 words from StopWords_Generic.txt
Reading file: StopWords_GenericLong.txt
Loaded 571 words from StopWords_GenericLong.txt
Reading file: StopWords_Geographic.txt
Loaded 199 words from StopWords_Geographic.txt
Reading file: StopWords_Names.txt
Loaded 13014 words from StopWords_Names.txt
Total stop words loaded: 12768


In [3]:
len(stop_words)

12768

In [23]:
import os
import nltk
def tokenize(document, stopwords):
    """Split a document into lower-cased word tokens, dropping stop words.

    Tokens are runs of word characters (``\\w+``) taken from the lower-cased
    text; any token found in ``stopwords`` is discarded.

    Returns:
        list[str]: Remaining tokens in document order.
    """
    word_splitter = nltk.RegexpTokenizer(r'\w+')
    kept = []
    for token in word_splitter.tokenize(document.lower()):
        if token not in stopwords:
            kept.append(token)
    return kept

def tokenize_file(file_location, stopwords):
    """Read a UTF-8 text file (undecodable bytes ignored) and tokenize it.

    Returns the token list produced by ``tokenize`` for the file's content.
    """
    with open(file_location, 'r', encoding='utf-8', errors='ignore') as handle:
        raw_text = handle.read()
    return tokenize(raw_text, stopwords)

def process_folder(input_folder, stopwords, output_folder):
    """Tokenize every ``.txt`` file of a folder and save the cleaned text.

    For each input file the surviving tokens are written, space-separated,
    to a file of the same name inside ``output_folder`` (created if absent).
    One progress line is printed per file.
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    text_files = (name for name in os.listdir(input_folder) if name.endswith('.txt'))
    for name in text_files:
        cleaned_tokens = tokenize_file(os.path.join(input_folder, name), stopwords)

        # Output keeps the input file name so later steps can pair them up.
        destination = os.path.join(output_folder, name)
        with open(destination, 'w', encoding='utf-8') as sink:
            sink.write(' '.join(cleaned_tokens))

        print(f"Processed and saved: {name}")

# Folders used by the cleaning step: stop-word lists in, raw articles in,
# stop-word-free token files out.
stop_words_folder = "StopWords" 
input_folder = "articles"  
output_folder = "CleanedTextFiles"  

# Reload the stop words and write one cleaned file per downloaded article.
stopwords = load_stop_words(stop_words_folder)
process_folder(input_folder, stopwords, output_folder)


Reading file: StopWords_Auditor.txt
Loaded 8 words from StopWords_Auditor.txt
Reading file: StopWords_Currencies.txt
Loaded 85 words from StopWords_Currencies.txt
Reading file: StopWords_DatesandNumbers.txt
Loaded 109 words from StopWords_DatesandNumbers.txt
Reading file: StopWords_Generic.txt
Loaded 121 words from StopWords_Generic.txt
Reading file: StopWords_GenericLong.txt
Loaded 571 words from StopWords_GenericLong.txt
Reading file: StopWords_Geographic.txt
Loaded 199 words from StopWords_Geographic.txt
Reading file: StopWords_Names.txt
Loaded 13014 words from StopWords_Names.txt
Total stop words loaded: 12768
Processed and saved: Netclan20241017.txt
Processed and saved: Netclan20241018.txt
Processed and saved: Netclan20241019.txt
Processed and saved: Netclan20241020.txt
Processed and saved: Netclan20241021.txt
Processed and saved: Netclan20241022.txt
Processed and saved: Netclan20241023.txt
Processed and saved: Netclan20241024.txt
Processed and saved: Netclan20241025.txt
Processed

In [4]:

def load_master_dictionary(positive_file, negative_file):
    """Merge the positive and negative word files into one lookup set.

    Every non-blank line of both files is stripped and lower-cased before
    being added, so a word present in both files appears once.

    Returns:
        set[str]: Union of the words in the two files.
    """
    combined = set()
    for path in (positive_file, negative_file):
        with open(path, 'r', encoding='utf-8', errors='ignore') as handle:
            for line in handle.read().splitlines():
                entry = line.strip()
                if entry:  # skip blank lines
                    combined.add(entry.lower())
    return combined

def load_word_list(file_path):
    """Read a one-word-per-line file into a list of lower-cased words.

    Blank lines are skipped; order and duplicates are preserved.
    """
    entries = []
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
        for line in handle.read().splitlines():
            trimmed = line.strip()
            if trimmed:  # skip blank lines
                entries.append(trimmed.lower())
    return entries

# Sentiment word lists, one word per line.
positive_file = "MasterDictionary/MasterDictionary/positive-words.txt"  
negative_file = "MasterDictionary/MasterDictionary/negative-words.txt" 
# Combined set of both lists, plus each list on its own (as Python lists).
master_dictionary = load_master_dictionary(positive_file, negative_file)
positive_word_list = load_word_list(positive_file)
negative_word_list = load_word_list(negative_file)
# Sanity check: the combined count is smaller than the sum when a word
# appears in both lists or more than once in one list.
print(f"Loaded {len(positive_word_list)} positive words.")
print(f"Loaded {len(negative_word_list)} negative words.")
print(f"Combined dictionary has {len(master_dictionary)} words.")


Loaded 2006 positive words.
Loaded 4783 negative words.
Combined dictionary has 6786 words.


In [29]:
import os

def process_cleaned_files_with_lists(cleaned_folder, positive_words, negative_words, output_folder):
    """Extract the positive and negative words found in each cleaned file.

    For every ``.txt`` file in ``cleaned_folder`` two files are written to
    ``output_folder``: ``<name>_positive.txt`` and ``<name>_negative.txt``.
    Each holds the matching words in document order, one per line, with
    repeats kept. A summary line is printed per input file.

    Args:
        cleaned_folder: Directory of whitespace-separated cleaned token files.
        positive_words: Iterable of positive words (list or set).
        negative_words: Iterable of negative words (list or set).
        output_folder: Destination directory; created when missing.
    """
    # `in` on a list scans it for every token; a set lookup is O(1).
    positive_lookup = (positive_words if isinstance(positive_words, (set, frozenset))
                       else set(positive_words))
    negative_lookup = (negative_words if isinstance(negative_words, (set, frozenset))
                       else set(negative_words))

    os.makedirs(output_folder, exist_ok=True)

    for file_name in os.listdir(cleaned_folder):
        if not file_name.endswith('.txt'):
            continue
        file_path = os.path.join(cleaned_folder, file_name)
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
            cleaned_text = file.read().split()

        positive_list = [word for word in cleaned_text if word in positive_lookup]
        negative_list = [word for word in cleaned_text if word in negative_lookup]

        base_name = os.path.splitext(file_name)[0]
        positive_output_file = os.path.join(output_folder, f"{base_name}_positive.txt")
        negative_output_file = os.path.join(output_folder, f"{base_name}_negative.txt")

        with open(positive_output_file, 'w', encoding='utf-8') as pos_file:
            pos_file.write('\n'.join(positive_list))

        with open(negative_output_file, 'w', encoding='utf-8') as neg_file:
            neg_file.write('\n'.join(negative_list))

        print(f"Processed {file_name}: {len(positive_list)} positive words, {len(negative_list)} negative words.")


# Read the cleaned token files and write per-article positive/negative
# word files into a separate folder.
cleaned_folder = "CleanedTextFiles" 
output_folder = "Processed_files"   

process_cleaned_files_with_lists(cleaned_folder, positive_word_list, negative_word_list, output_folder)


Processed Netclan20241017.txt: 8 positive words, 1 negative words.
Processed Netclan20241018.txt: 15 positive words, 8 negative words.
Processed Netclan20241019.txt: 13 positive words, 3 negative words.
Processed Netclan20241020.txt: 26 positive words, 12 negative words.
Processed Netclan20241021.txt: 5 positive words, 1 negative words.
Processed Netclan20241022.txt: 8 positive words, 1 negative words.
Processed Netclan20241023.txt: 15 positive words, 8 negative words.
Processed Netclan20241024.txt: 13 positive words, 3 negative words.
Processed Netclan20241025.txt: 26 positive words, 12 negative words.
Processed Netclan20241026.txt: 59 positive words, 22 negative words.
Processed Netclan20241027.txt: 21 positive words, 7 negative words.
Processed Netclan20241028.txt: 30 positive words, 3 negative words.
Processed Netclan20241029.txt: 23 positive words, 6 negative words.
Processed Netclan20241030.txt: 16 positive words, 6 negative words.
Processed Netclan20241031.txt: 64 positive words

In [5]:
import os
import pandas as pd

def calculate_scores_from_file(file_path, positive_words, negative_words):
    """Count positive and negative words in a whitespace-separated file.

    Args:
        file_path: UTF-8 text file to score.
        positive_words: Iterable of positive words (list or set).
        negative_words: Iterable of negative words (list or set).

    Returns:
        tuple[int, int]: ``(positive_score, negative_score)`` where the
        positive score is the number of tokens found in ``positive_words``
        and the negative score is minus the number found in
        ``negative_words`` (so it is zero or negative).
    """
    # `in` on a list scans it for every token; a set lookup is O(1).
    positive_lookup = (positive_words if isinstance(positive_words, (set, frozenset))
                       else set(positive_words))
    negative_lookup = (negative_words if isinstance(negative_words, (set, frozenset))
                       else set(negative_words))

    with open(file_path, 'r', encoding='utf-8') as file:
        words = file.read().split()

    positive_score = sum(1 for word in words if word in positive_lookup)
    # The negative score is kept negative by convention.
    negative_score = -sum(1 for word in words if word in negative_lookup)

    return positive_score, negative_score

def get_total_words_after_cleaning(cleaned_folder, base_name):
    """Return the token count of ``<cleaned_folder>/<base_name>.txt``.

    Tokens are whitespace-separated. A missing file yields 0 rather than
    raising an error.
    """
    target = os.path.join(cleaned_folder, f"{base_name}.txt")
    if os.path.exists(target):
        with open(target, 'r', encoding='utf-8') as handle:
            return len(handle.read().split())
    # No cleaned file for this article.
    return 0

def process_combined_scores(input_folder, cleaned_folder, positive_words, negative_words):
    """Build a per-article sentiment table from the extracted word files.

    Every ``<base>_positive.txt`` and ``<base>_negative.txt`` file in
    ``input_folder`` is scored with ``calculate_scores_from_file`` and the
    scores are summed per ``<base>``. Files with any other name are ignored.

    Args:
        input_folder: Folder holding the ``*_positive.txt`` / ``*_negative.txt`` files.
        cleaned_folder: Folder holding the cleaned ``<base>.txt`` token files,
            used for the total word count.
        positive_words: Iterable of positive words.
        negative_words: Iterable of negative words.

    Returns:
        pandas.DataFrame: One row per article with columns "File Name",
        "Positive Score", "Negative Score" (zero or negative),
        "Polarity Score", "Subjectivity Score" and
        "Total Words After Cleaning".
    """
    suffixes = ("_positive.txt", "_negative.txt")
    scores_dict = {}

    for file_name in os.listdir(input_folder):
        # Match the exact trailing marker. A substring test followed by
        # str.replace mis-derives the base name when the marker appears
        # elsewhere in the file name.
        matched_suffix = next((s for s in suffixes if file_name.endswith(s)), None)
        if matched_suffix is None:
            continue
        base_name = file_name[:-len(matched_suffix)]
        file_path = os.path.join(input_folder, file_name)

        positive_score, negative_score = calculate_scores_from_file(file_path, positive_words, negative_words)

        totals = scores_dict.setdefault(base_name, {"Positive Score": 0, "Negative Score": 0})
        totals["Positive Score"] += positive_score
        totals["Negative Score"] += negative_score

    data = []
    for base_name, scores in scores_dict.items():
        positive_score = scores["Positive Score"]
        negative_score = scores["Negative Score"]

        total_words_after_cleaning = get_total_words_after_cleaning(cleaned_folder, base_name)

        # The small constant keeps both ratios defined when a denominator is 0.
        sentiment_total = positive_score + abs(negative_score)
        polarity_score = (positive_score - abs(negative_score)) / (sentiment_total + 0.000001)
        subjectivity_score = sentiment_total / (total_words_after_cleaning + 0.000001)

        data.append({
            "File Name": base_name,
            "Positive Score": positive_score,
            "Negative Score": negative_score,
            "Polarity Score": polarity_score,
            "Subjectivity Score": subjectivity_score,
            "Total Words After Cleaning": total_words_after_cleaning
        })

    df = pd.DataFrame(data)
    return df

     # Replace with your negative words list
input_folder = "Processed_files"                    # Folder containing files with positive/negative words
# Must match the folder written by the cleaning step exactly. The previous
# spelling "Cleanedtextfiles" only resolves on case-insensitive filesystems;
# elsewhere every word count silently becomes 0.
cleaned_folder = "CleanedTextFiles"                 # Folder containing cleaned text files

# Generate the combined DataFrame
df_combined_scores = process_combined_scores(input_folder, cleaned_folder, positive_word_list, negative_word_list)

# Save to a CSV file for reference
df_combined_scores.to_csv("combined_scores_summary.csv", index=False)

# Display the DataFrame
df_combined_scores


Unnamed: 0,File Name,Positive Score,Negative Score,Polarity Score,Subjectivity Score,Total Words After Cleaning
0,Netclan20241017,8,-1,0.777778,0.035433,254
1,Netclan20241018,15,-8,0.304348,0.044146,521
2,Netclan20241019,13,-3,0.625000,0.054795,292
3,Netclan20241020,26,-12,0.368421,0.074803,508
4,Netclan20241021,5,-1,0.666667,0.023529,255
...,...,...,...,...,...,...
142,Netclan20241159,15,-10,0.200000,0.038168,655
143,Netclan20241160,23,-18,0.121951,0.045759,896
144,Netclan20241161,9,-7,0.125000,0.066667,240
145,Netclan20241162,3,-1,0.500000,0.027972,143


In [12]:
pip install spacy


Collecting spacy
  Downloading spacy-3.8.3-cp39-cp39-win_amd64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.11-cp39-cp39-win_amd64.whl.metadata (2.0 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.10-cp39-cp39-win_amd64.whl.metadata (8.6 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.9-cp39-cp39-win_amd64.whl.metadata (2.2 kB)
Collecting thinc<8.4.0,>=8.3.0 (from spacy)
  Downloading thinc-8.3.3-cp39-cp39-win_amd64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Downloading wasabi-1.1.3-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.3 (from spacy)
  Downloading srsly-2.5.0-cp39-cp39-win_amd64.whl.metadat

In [15]:
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --- ------------------------------------ 1.0/12.8 MB 5.6 MB/s eta 0:00:03
     -------- ------------------------------- 2.6/12.8 MB 6.9 MB/s eta 0:00:02
     ----------------- ---------------------- 5.5/12.8 MB 8.8 MB/s eta 0:00:01
     ------------------------------- ------- 10.2/12.8 MB 13.0 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 12.4 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [17]:
import os
import re
import pandas as pd
import spacy
from collections import Counter

# Load SpaCy language model
nlp = spacy.load("en_core_web_sm")

def count_syllables(word):
    """Estimate how many syllables a word has.

    Heuristic: drop one trailing "e", count groups of consecutive vowels
    (a, e, i, o, u, y), subtract one for an "es"/"ed" ending on words longer
    than two letters, and never return less than 1.
    """
    lowered = word.lower()
    if lowered.endswith("e"):
        lowered = lowered[:-1]  # treat a final 'e' as silent

    groups = 0
    previous_was_vowel = False
    for letter in lowered:
        is_vowel = letter in "aeiouy"
        # Only the first vowel of a run starts a new syllable.
        if is_vowel and not previous_was_vowel:
            groups += 1
        previous_was_vowel = is_vowel

    # "es" / "ed" endings usually do not add a syllable.
    if len(lowered) > 2 and lowered.endswith(("es", "ed")):
        groups = max(1, groups - 1)
    return max(1, groups)

def analyze_text(text):
    """Compute readability metrics for a piece of text.

    The text is parsed with the module-level spaCy pipeline ``nlp``. Only
    alphabetic tokens count as words, and a word is "complex" when
    ``count_syllables`` reports more than two syllables.

    Args:
        text: Raw text to analyse.

    Returns:
        dict | None: ``None`` when the text has no sentences or no alphabetic
        words (every metric would divide by zero). Otherwise a dict with
        keys "Average Sentence Length", "Percentage of Complex Words"
        (0-100), "Fog Index", "Average Words Per Sentence",
        "Complex Word Count", "Personal Pronouns", "Average Word Length",
        "Avg Syllables Per Word" and "Syllable Counts" (one int per word).
    """
    doc = nlp(text)
    words = [token.text for token in doc if token.is_alpha]
    word_count = len(words)
    sentence_count = len(list(doc.sents))

    # Both counts are denominators below. Text made only of digits or
    # punctuation has sentences but no words.
    if sentence_count == 0 or word_count == 0:
        return None

    # Average sentence length, in words per sentence.
    avg_sentence_length = word_count / sentence_count

    syllable_counts = [count_syllables(word) for word in words]
    avg_syllables_per_word = sum(syllable_counts) / word_count

    complex_word_count = sum(1 for syllables in syllable_counts if syllables > 2)
    percentage_complex_words = complex_word_count / word_count

    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words * 100)

    # Same ratio as the average sentence length; reported under both names.
    avg_words_per_sentence = avg_sentence_length

    # NOTE(review): matching is case-insensitive, so the token "US" is
    # counted as the pronoun "us" — confirm that this is intended.
    pronouns = {"i", "we", "my", "ours", "us"}
    personal_pronouns = sum(1 for word in words if word.lower() in pronouns)

    avg_word_length = sum(len(word) for word in words) / word_count

    return {
        "Average Sentence Length": avg_sentence_length,
        "Percentage of Complex Words": percentage_complex_words * 100,
        "Fog Index": fog_index,
        "Average Words Per Sentence": avg_words_per_sentence,
        "Complex Word Count": complex_word_count,
        "Personal Pronouns": personal_pronouns,
        "Average Word Length": avg_word_length,
        "Avg Syllables Per Word": avg_syllables_per_word,
        "Syllable Counts": syllable_counts,  # per-word syllable counts
    }

def analyze_folder(folder_path):
    """Run ``analyze_text`` on each ``.txt`` file of a folder.

    Returns a DataFrame indexed by file name with one row per file that
    produced a result. The per-word syllable counts are stored as a
    comma-separated string in the "Syllable Counts Per Word" column.
    """
    rows = []
    row_labels = []

    for entry in os.listdir(folder_path):
        full_path = os.path.join(folder_path, entry)
        if not os.path.isfile(full_path) or not entry.endswith(".txt"):
            continue
        with open(full_path, 'r', encoding='utf-8') as handle:
            metrics = analyze_text(handle.read())
        if not metrics:
            continue  # nothing to report for this file
        # Replace the raw list with a printable string column.
        per_word = metrics.pop("Syllable Counts")
        metrics["Syllable Counts Per Word"] = ", ".join(str(n) for n in per_word)
        rows.append(metrics)
        row_labels.append(entry)

    return pd.DataFrame(rows, index=row_labels)




# Folder holding the raw downloaded articles (not the cleaned token files).
folder_path = "articles"

# Analyze the folder. NOTE: this rebinds `df`, which an earlier cell used
# for the input sheet.
df = analyze_folder(folder_path)

# Save the readability table; the index (file names) is written as the first column.
df.to_csv("readability_analysis.csv")
df


Unnamed: 0,Average Sentence Length,Percentage of Complex Words,Fog Index,Average Words Per Sentence,Complex Word Count,Personal Pronouns,Average Word Length,Avg Syllables Per Word,Syllable Counts Per Word
Netclan20241017.txt,32.727273,26.388889,23.646465,32.727273,95,2,5.936111,1.941667,"1, 1, 1, 1, 2, 4, 1, 2, 2, 1, 1, 4, 3, 4, 1, 2..."
Netclan20241018.txt,27.105263,17.184466,17.715892,27.105263,177,8,5.053398,1.657282,"3, 1, 1, 2, 1, 5, 1, 2, 2, 3, 1, 2, 4, 1, 2, 3..."
Netclan20241019.txt,31.076923,22.029703,21.242650,31.076923,89,2,5.784653,1.866337,"1, 2, 1, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 3, 2, 4..."
Netclan20241020.txt,23.766667,33.239832,22.802599,23.766667,237,5,6.375877,2.107994,"3, 3, 1, 4, 1, 3, 2, 1, 1, 1, 3, 3, 4, 1, 3, 2..."
Netclan20241021.txt,26.142857,24.863388,20.402498,26.142857,91,2,5.765027,1.923497,"4, 1, 1, 2, 1, 3, 2, 3, 2, 4, 2, 2, 1, 2, 3, 3..."
...,...,...,...,...,...,...,...,...,...
Netclan20241159.txt,22.790698,21.938776,17.891789,22.790698,215,4,5.497959,1.845918,"4, 1, 4, 2, 1, 4, 3, 2, 4, 2, 2, 1, 2, 3, 3, 3..."
Netclan20241160.txt,22.530303,16.879623,15.763971,22.530303,251,13,5.255548,1.715535,"1, 1, 2, 2, 4, 1, 3, 3, 2, 4, 2, 2, 1, 2, 3, 3..."
Netclan20241161.txt,24.176471,18.248175,16.969858,24.176471,75,12,5.284672,1.749392,"2, 2, 4, 3, 2, 4, 2, 2, 1, 2, 3, 3, 3, 1, 3, 2..."
Netclan20241162.txt,41.800000,28.708134,28.203254,41.800000,60,1,6.325359,2.110048,"2, 1, 1, 2, 2, 2, 1, 3, 2, 4, 2, 2, 1, 2, 3, 3..."


In [28]:
# Align the two tables on the article identifier rather than on row position.
# A positional concat pairs the wrong rows whenever os.listdir returns the
# two folders in different orders or one table is missing an article.
_readability = df.rename(index=lambda name: os.path.splitext(str(name))[0])
result = df_combined_scores.join(_readability, on="File Name").reset_index(drop=True)

In [29]:
# Display the merged table (the dangling `result.` was a syntax error).
result

Unnamed: 0,File Name,Positive Score,Negative Score,Polarity Score,Subjectivity Score,Total Words After Cleaning,Average Sentence Length,Percentage of Complex Words,Fog Index,Average Words Per Sentence,Complex Word Count,Personal Pronouns,Average Word Length,Avg Syllables Per Word,Syllable Counts Per Word
0,Netclan20241017,8,-1,0.777778,0.035433,254,32.727273,26.388889,23.646465,32.727273,95,2,5.936111,1.941667,"1, 1, 1, 1, 2, 4, 1, 2, 2, 1, 1, 4, 3, 4, 1, 2..."
1,Netclan20241018,15,-8,0.304348,0.044146,521,27.105263,17.184466,17.715892,27.105263,177,8,5.053398,1.657282,"3, 1, 1, 2, 1, 5, 1, 2, 2, 3, 1, 2, 4, 1, 2, 3..."
2,Netclan20241019,13,-3,0.625000,0.054795,292,31.076923,22.029703,21.242650,31.076923,89,2,5.784653,1.866337,"1, 2, 1, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 3, 2, 4..."
3,Netclan20241020,26,-12,0.368421,0.074803,508,23.766667,33.239832,22.802599,23.766667,237,5,6.375877,2.107994,"3, 3, 1, 4, 1, 3, 2, 1, 1, 1, 3, 3, 4, 1, 3, 2..."
4,Netclan20241021,5,-1,0.666667,0.023529,255,26.142857,24.863388,20.402498,26.142857,91,2,5.765027,1.923497,"4, 1, 1, 2, 1, 3, 2, 3, 2, 4, 2, 2, 1, 2, 3, 3..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,Netclan20241159,15,-10,0.200000,0.038168,655,22.790698,21.938776,17.891789,22.790698,215,4,5.497959,1.845918,"4, 1, 4, 2, 1, 4, 3, 2, 4, 2, 2, 1, 2, 3, 3, 3..."
143,Netclan20241160,23,-18,0.121951,0.045759,896,22.530303,16.879623,15.763971,22.530303,251,13,5.255548,1.715535,"1, 1, 2, 2, 4, 1, 3, 3, 2, 4, 2, 2, 1, 2, 3, 3..."
144,Netclan20241161,9,-7,0.125000,0.066667,240,24.176471,18.248175,16.969858,24.176471,75,12,5.284672,1.749392,"2, 2, 4, 3, 2, 4, 2, 2, 1, 2, 3, 3, 3, 1, 3, 2..."
145,Netclan20241162,3,-1,0.500000,0.027972,143,41.800000,28.708134,28.203254,41.800000,60,1,6.325359,2.110048,"2, 1, 1, 2, 2, 2, 1, 3, 2, 4, 2, 2, 1, 2, 3, 3..."


In [31]:
result.rename(columns={"Total Words After Cleaning": "Word Count"}, inplace=True)

In [32]:
result

Unnamed: 0,File Name,Positive Score,Negative Score,Polarity Score,Subjectivity Score,Word Count,Average Sentence Length,Percentage of Complex Words,Fog Index,Average Words Per Sentence,Complex Word Count,Personal Pronouns,Average Word Length,Avg Syllables Per Word,Syllable Counts Per Word
0,Netclan20241017,8,-1,0.777778,0.035433,254,32.727273,26.388889,23.646465,32.727273,95,2,5.936111,1.941667,"1, 1, 1, 1, 2, 4, 1, 2, 2, 1, 1, 4, 3, 4, 1, 2..."
1,Netclan20241018,15,-8,0.304348,0.044146,521,27.105263,17.184466,17.715892,27.105263,177,8,5.053398,1.657282,"3, 1, 1, 2, 1, 5, 1, 2, 2, 3, 1, 2, 4, 1, 2, 3..."
2,Netclan20241019,13,-3,0.625000,0.054795,292,31.076923,22.029703,21.242650,31.076923,89,2,5.784653,1.866337,"1, 2, 1, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 3, 2, 4..."
3,Netclan20241020,26,-12,0.368421,0.074803,508,23.766667,33.239832,22.802599,23.766667,237,5,6.375877,2.107994,"3, 3, 1, 4, 1, 3, 2, 1, 1, 1, 3, 3, 4, 1, 3, 2..."
4,Netclan20241021,5,-1,0.666667,0.023529,255,26.142857,24.863388,20.402498,26.142857,91,2,5.765027,1.923497,"4, 1, 1, 2, 1, 3, 2, 3, 2, 4, 2, 2, 1, 2, 3, 3..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,Netclan20241159,15,-10,0.200000,0.038168,655,22.790698,21.938776,17.891789,22.790698,215,4,5.497959,1.845918,"4, 1, 4, 2, 1, 4, 3, 2, 4, 2, 2, 1, 2, 3, 3, 3..."
143,Netclan20241160,23,-18,0.121951,0.045759,896,22.530303,16.879623,15.763971,22.530303,251,13,5.255548,1.715535,"1, 1, 2, 2, 4, 1, 3, 3, 2, 4, 2, 2, 1, 2, 3, 3..."
144,Netclan20241161,9,-7,0.125000,0.066667,240,24.176471,18.248175,16.969858,24.176471,75,12,5.284672,1.749392,"2, 2, 4, 3, 2, 4, 2, 2, 1, 2, 3, 3, 3, 1, 3, 2..."
145,Netclan20241162,3,-1,0.500000,0.027972,143,41.800000,28.708134,28.203254,41.800000,60,1,6.325359,2.110048,"2, 1, 1, 2, 2, 2, 1, 3, 2, 4, 2, 2, 1, 2, 3, 3..."


In [33]:
import pandas as pd
file_path = "Input.xlsx - Sheet1.csv"
df = pd.read_csv(file_path)
# Look each URL up by its URL_ID (the "File Name" of the result rows) rather
# than assigning by row position, which silently mislabels rows when the
# two tables are ordered differently or differ in length.
url_by_id = df.drop_duplicates(subset="URL_ID").set_index("URL_ID")["URL"]
result["URL"] = result["File Name"].map(url_by_id)

In [34]:
result

Unnamed: 0,File Name,Positive Score,Negative Score,Polarity Score,Subjectivity Score,Word Count,Average Sentence Length,Percentage of Complex Words,Fog Index,Average Words Per Sentence,Complex Word Count,Personal Pronouns,Average Word Length,Avg Syllables Per Word,Syllable Counts Per Word,URL
0,Netclan20241017,8,-1,0.777778,0.035433,254,32.727273,26.388889,23.646465,32.727273,95,2,5.936111,1.941667,"1, 1, 1, 1, 2, 4, 1, 2, 2, 1, 1, 4, 3, 4, 1, 2...",https://insights.blackcoffer.com/ai-and-ml-bas...
1,Netclan20241018,15,-8,0.304348,0.044146,521,27.105263,17.184466,17.715892,27.105263,177,8,5.053398,1.657282,"3, 1, 1, 2, 1, 5, 1, 2, 2, 3, 1, 2, 4, 1, 2, 3...",https://insights.blackcoffer.com/enhancing-fro...
2,Netclan20241019,13,-3,0.625000,0.054795,292,31.076923,22.029703,21.242650,31.076923,89,2,5.784653,1.866337,"1, 2, 1, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 3, 2, 4...",https://insights.blackcoffer.com/roas-dashboar...
3,Netclan20241020,26,-12,0.368421,0.074803,508,23.766667,33.239832,22.802599,23.766667,237,5,6.375877,2.107994,"3, 3, 1, 4, 1, 3, 2, 1, 1, 1, 3, 3, 4, 1, 3, 2...",https://insights.blackcoffer.com/efficient-pro...
4,Netclan20241021,5,-1,0.666667,0.023529,255,26.142857,24.863388,20.402498,26.142857,91,2,5.765027,1.923497,"4, 1, 1, 2, 1, 3, 2, 3, 2, 4, 2, 2, 1, 2, 3, 3...",https://insights.blackcoffer.com/development-o...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,Netclan20241159,15,-10,0.200000,0.038168,655,22.790698,21.938776,17.891789,22.790698,215,4,5.497959,1.845918,"4, 1, 4, 2, 1, 4, 3, 2, 4, 2, 2, 1, 2, 3, 3, 3...",https://insights.blackcoffer.com/population-an...
143,Netclan20241160,23,-18,0.121951,0.045759,896,22.530303,16.879623,15.763971,22.530303,251,13,5.255548,1.715535,"1, 1, 2, 2, 4, 1, 3, 3, 2, 4, 2, 2, 1, 2, 3, 3...",https://insights.blackcoffer.com/google-lsa-ap...
144,Netclan20241161,9,-7,0.125000,0.066667,240,24.176471,18.248175,16.969858,24.176471,75,12,5.284672,1.749392,"2, 2, 4, 3, 2, 4, 2, 2, 1, 2, 3, 3, 3, 1, 3, 2...",https://insights.blackcoffer.com/healthcare-da...
145,Netclan20241162,3,-1,0.500000,0.027972,143,41.800000,28.708134,28.203254,41.800000,60,1,6.325359,2.110048,"2, 1, 1, 2, 2, 2, 1, 3, 2, 4, 2, 2, 1, 2, 3, 3...",https://insights.blackcoffer.com/budget-sales-...


In [35]:
# Export the merged analysis table. `df` holds only the input sheet at this
# point, so writing it would drop every computed metric. index=False avoids
# a stray "Unnamed: 0" column when the file is read back.
result.to_csv("Output Data STructure.csv", index=False)