In [None]:
!pip install Newspaper3k

In [None]:

import pandas as pd
from newspaper import Article

# Read the list of articles to fetch (expects URL and URL_ID columns).
df = pd.read_excel("/content/Input.xlsx")
urls = df["URL"].tolist()
url_ids = df["URL_ID"].tolist()

# Download and parse every article; a URL that fails is reported and skipped,
# so it will not appear in the resulting frame.
data = []
for url_id, url in zip(url_ids, urls):
    article = Article(url)
    try:
        article.download()
        article.parse()
        record = {
            "url_id": url_id,
            "url": url,
            "title": article.title,
            "content": article.text,
        }
        data.append(record)
    except Exception as e:
        print(f"Error processing URL: {url}\n{e}")

# From here on `df` holds one row per successfully scraped article.
df = pd.DataFrame(data)
print(df)

In [None]:
# Print every scraped url_id without truncation, to see which articles were fetched.
print(df['url_id'].to_string())

In [None]:
# Dump each article to "<url_id>.txt" in the working directory.
for url_id, title, content in zip(df["url_id"], df["title"], df["content"]):
    url_id = str(url_id)
    article_dump = f"Title:\n{title}\n\nContent:\n{content}"

    # UTF-8 so non-ASCII characters in the article survive the round trip
    with open(f"{url_id}.txt", "w", encoding="utf-8") as f:
        f.write(article_dump)

In [None]:
# Text to analyse: the title, a blank line, then the article body.
title_with_gap = df["title"] + "\n\n"
df["analysis"] = title_with_gap + df["content"]


In [None]:
# Inspect the frame now that the combined `analysis` column exists.
print(df)

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Loading stop words from NLTK
stop_words = stopwords.words("english")

# Loading the custom stop words.
# Entries are stripped and lowercased because clean_text() compares against
# lowercased tokens; an entry with capitals or stray whitespace could never
# match otherwise. Blank lines are dropped.
with open("/content/StopWords.txt", "r") as f:
    custom_stop_words = {
        line.strip().lower() for line in f.read().splitlines() if line.strip()
    }

# Only append entries NLTK does not already provide, so the list has no duplicates.
stop_words.extend(custom_stop_words - set(stop_words))

In [None]:
# Define a function to clean text
def clean_text(text, stopword_collection=None):
    """Lowercase ``text`` and drop every whitespace-separated stop word.

    Args:
        text: The raw text to clean.
        stopword_collection: Optional iterable of stop words. When omitted the
            module-level ``stop_words`` is used.

    Returns:
        The surviving lowercased tokens joined by single spaces. Surviving
        tokens keep any punctuation attached to them.
    """
    import string

    if stopword_collection is None:
        stopword_collection = stop_words

    # A set gives constant-time lookups; lowercasing the entries makes the
    # comparison case-insensitive on both sides.
    lookup = {str(entry).lower() for entry in stopword_collection}

    # Punctuation is only removed from the text in a later step, so a token
    # such as "the," must be compared without its surrounding punctuation or
    # it would slip through the filter.
    edge_chars = string.punctuation + "’"

    cleaned_text = []
    for word in text.lower().split():
        bare = word.strip(edge_chars)
        if word in lookup or (bare and bare in lookup):
            continue
        cleaned_text.append(word)
    return " ".join(cleaned_text)

# Replace the 'analysis' text with its stop-word-free, lowercased version
df['analysis'] = df['analysis'].map(clean_text)

print(df)

In [None]:
import string

# ASCII punctuation plus the typographic apostrophe common in web articles.
punctuation = string.punctuation + "’"

# Built once: the translation table is the same for every call.
_PUNCTUATION_TABLE = str.maketrans('', '', punctuation)

# Function to remove punctuation
def remove_punctuation(text):
    """Return ``text`` with every character listed in ``punctuation`` deleted.

    Characters are removed, not replaced by spaces, so "state-of-the-art"
    becomes "stateoftheart".
    """
    return text.translate(_PUNCTUATION_TABLE)

# Strip punctuation from the already stop-word-filtered text
df['analysis'] = df['analysis'].map(remove_punctuation)

print(df)

In [None]:
import nltk
nltk.download('punkt')
# NOTE(review): recent NLTK releases make word_tokenize look up 'punkt_tab'
# rather than 'punkt'; requesting both keeps the cell working on either.
# On older releases this call only reports that the package is unknown.
nltk.download('punkt_tab')
nltk.download('words')


def _load_word_list(path):
    """Read one word per line from ``path`` and return the words as a set.

    Entries are stripped and lowercased (the analysed text is lowercased too)
    and blank lines are ignored.
    """
    with open(path, encoding="utf-8") as f:
        return {line.strip().lower() for line in f.read().splitlines() if line.strip()}


# Loading positive and negative dictionaries.
# Both lists are read the same way with plain file I/O; the positive list
# previously went through the `nltk.corpus.words` reader, which is meant for
# files inside the NLTK "words" corpus rather than arbitrary paths.
positive_words = _load_word_list("/content/positive_words.txt")
negative_words = _load_word_list("/content/negative_words.txt")


# Function to calculate sentiment scores
def calculate_sentiment_scores(text):
    """Count lexicon hits in ``text``.

    Returns:
        A ``(positive_score, negative_score)`` tuple. ``positive_score`` is the
        number of tokens found in ``positive_words``; ``negative_score`` is the
        number of tokens found in ``negative_words`` with its sign flipped
        (zero or negative).
    """
    positive_hits = 0
    negative_hits = 0
    for token in nltk.word_tokenize(text):
        if token in positive_words:
            positive_hits += 1
        if token in negative_words:
            negative_hits += 1
    return positive_hits, -negative_hits

# Score every document, then unpack the (positive, negative) pairs into columns
results = df['analysis'].apply(calculate_sentiment_scores)
score_frame = pd.DataFrame(
    results.tolist(),
    index=df.index,
    columns=['positive_score', 'negative_score'],
)
df['positive_score'] = score_frame['positive_score']
df['negative_score'] = score_frame['negative_score']


print(df)



In [None]:
# Word count per document = number of whitespace-separated tokens left in 'analysis'
df["total_words"] = [len(text.split()) for text in df["analysis"]]


print(df)




In [None]:
# Sanity check: number of missing values in each column.
df.isnull().sum()

In [None]:
# Sanity check: (rows, columns) of the three inputs used by the score formulas below.
print(df[['positive_score', 'negative_score', 'total_words']].shape)


In [None]:
# Function to calculate polarity_score
def calculate_polarity_score(positive_score, negative_score):
    """Return (P - N) / (P + N + 1e-6), where N is the magnitude of ``negative_score``.

    ``negative_score`` may be given either as a plain count or as a negated
    count; using its absolute value keeps the result inside [-1, 1] in both
    cases. With the sign left in, P + N could collapse to zero and the ratio
    blew up.

    Args:
        positive_score: Number of positive-lexicon hits.
        negative_score: Number of negative-lexicon hits, signed or unsigned.

    Returns:
        The polarity as a float; close to 0 when there are no hits at all.
    """
    negative_magnitude = abs(negative_score)
    polarity_score = (positive_score - negative_magnitude) / (
        (positive_score + negative_magnitude) + 0.000001  # Avoid division by zero
    )
    return polarity_score

# Creating the polarity_score column.
# 'negative_score' may still carry a minus sign at this point, so its magnitude
# is passed explicitly; the formula expects two non-negative counts. The
# arithmetic works column-wise, so no row-by-row apply is needed.
df['polarity_score'] = calculate_polarity_score(
    df['positive_score'],
    df['negative_score'].abs(),
)

print(df)

In [None]:
# Store the negative score as a non-negative magnitude (a no-op if it already is one).
df['negative_score'] = df['negative_score'].abs()
print(df)

In [None]:
# Function to calculate polarity_score
# NOTE(review): this cell defines calculate_polarity_score a second time; the
# body is kept sign-safe so whichever definition ran last gives the same result.
def calculate_polarity_score(positive_score, negative_score):
    """Return (P - N) / (P + N + 1e-6), where N is the magnitude of ``negative_score``.

    Args:
        positive_score: Number of positive-lexicon hits.
        negative_score: Number of negative-lexicon hits, signed or unsigned.

    Returns:
        The polarity as a float in [-1, 1]; close to 0 when there are no hits.
    """
    negative_magnitude = abs(negative_score)
    polarity_score = (positive_score - negative_magnitude) / (
        (positive_score + negative_magnitude) + 0.000001  # Avoid division by zero
    )
    return polarity_score

# Recompute polarity_score now that 'negative_score' is non-negative.
# The formula is plain arithmetic, so it is evaluated on whole columns at once.
df['polarity_score'] = calculate_polarity_score(
    df['positive_score'],
    df['negative_score'],
)

print(df)

In [None]:
# Function to calculate subjectivity_score with total_words
def calculate_subjectivity_score(positive_score, negative_score, total_words):
    """Return the share of words that hit either lexicon.

    Computed as (positive_score + negative_score) / (total_words + 1e-6); the
    small constant keeps the division defined when ``total_words`` is 0.
    """
    lexicon_hits = positive_score + negative_score
    smoothed_word_count = total_words + 0.000001
    return lexicon_hits / smoothed_word_count

# subjectivity_score from both sentiment counts and the word count,
# evaluated column-wise in a single call
df['subjectivity_score'] = calculate_subjectivity_score(
    df['positive_score'],
    df['negative_score'],
    df['total_words'],
)

print(df)

In [None]:
# Function to calculate average sentence length
def calculate_average_sentence_length(text):
    """Return the mean number of words per sentence in ``text``.

    Sentences are the non-empty fragments between runs of '.', '!' or '?'.
    Ignoring empty fragments matters: a text ending in a period used to yield
    one extra, empty "sentence" that lowered the average. Abbreviations and
    decimal numbers are still treated as sentence breaks.

    Returns:
        Words divided by sentences as a float, or 0.0 if there is no sentence.
    """
    import re  # local import: `re` is only imported further down the notebook

    words = text.split()
    sentences = [part for part in re.split(r"[.!?]+", text) if part.strip()]
    if not sentences:
        return 0.0
    return len(words) / len(sentences)

# Function to calculate average number of words per sentence
def calculate_average_words_per_sentence(total_words, total_sentences):
    """Return ``total_words / total_sentences``.

    Returns 0.0 instead of raising when ``total_sentences`` is a plain zero,
    so a document without any sentence does not abort the run.
    """
    try:
        return total_words / total_sentences
    except ZeroDivisionError:
        return 0.0

# Applying both functions to create new columns
df['average_sentence_length'] = df['content'].apply(calculate_average_sentence_length)

# Words per sentence must be computed per document. The earlier version divided
# the corpus-wide word total by the number of documents and wrote that single
# number into every row.
# Sentences = non-empty fragments of the raw content between '.', '!' and '?'.
sentence_counts = df['content'].apply(
    lambda text: sum(
        1
        for fragment in text.replace('!', '.').replace('?', '.').split('.')
        if fragment.strip()
    )
)
# NOTE(review): 'total_words' counts the cleaned text (stop words removed), as
# in the original call; confirm this is the intended numerator.
df['average_words_per_sentence'] = [
    calculate_average_words_per_sentence(words, sentences) if sentences else 0.0
    for words, sentences in zip(df['total_words'], sentence_counts)
]

print(df)



In [None]:
from nltk.corpus import cmudict

# Function to count complex words
_CMU_PRONUNCIATIONS = None  # filled on first use by _load_cmu_pronunciations()


def _load_cmu_pronunciations():
    """Download (once) and return the CMU pronouncing dictionary as a dict."""
    global _CMU_PRONUNCIATIONS
    if _CMU_PRONUNCIATIONS is None:
        # 'wordnet' is not used here; it is still fetched because the original
        # function did so and other cells may rely on it being present.
        nltk.download('wordnet')
        nltk.download('cmudict')
        _CMU_PRONUNCIATIONS = cmudict.dict()
    return _CMU_PRONUNCIATIONS


def count_complex_words(text, pronunciations=None):
    """Count the words in ``text`` that have more than two syllables.

    Syllables come from the first CMU pronunciation of each word: every vowel
    phoneme ends in a stress digit (0, 1 or 2), so the number of such phonemes
    is the syllable count. The earlier version compared the total number of
    phonemes with 2, which flagged almost every word, and it reloaded the
    dictionary on every call.

    Args:
        text: Whitespace-separated words.
        pronunciations: Optional mapping ``word -> list of phoneme lists``.
            Defaults to the CMU dictionary, loaded once and cached.

    Returns:
        Number of complex words. Words missing from the mapping are skipped.
    """
    if pronunciations is None:
        pronunciations = _load_cmu_pronunciations()

    complex_words = 0
    for word in text.split():
        variants = pronunciations.get(word.lower())
        if not variants:
            continue  # unknown word: cannot be judged
        syllables = sum(1 for phoneme in variants[0] if phoneme[-1:].isdigit())
        if syllables > 2:
            complex_words += 1
    return complex_words

# Count complex words in each cleaned document and store the result
df['complex_words'] = df['analysis'].map(count_complex_words)


print(df)



In [None]:
# Share of complex words per document (a fraction between 0 and 1, not multiplied by 100)
df['complex_words_percentage'] = df['complex_words'].div(df['total_words'])


In [None]:
# Fog index = 0.4 * (average sentence length + share of complex words)
readability_sum = df['average_sentence_length'] + df['complex_words_percentage']
df['fog_index'] = readability_sum * 0.4


print(df)

In [None]:

def count_syllables(word):
    """
    Estimate the number of syllables in a word by counting its vowels (a, e, i,
    o, u), case-insensitively.

    A trailing "es" or "ed" that follows a consonant is not counted as a
    syllable ("makes" -> 1, "jumped" -> 1). The letter before the suffix is
    word[-3]; testing word[-2] would always see the "e" of the suffix, so that
    rule could never apply.

    Returns at least 1, so words without any vowel still count as one syllable.
    """
    vowels = "aeiou"
    lowered = word.lower()
    num_vowels = sum(1 for char in lowered if char in vowels)

    # Handling exceptions for "es" and "ed"
    if lowered.endswith("es") and len(lowered) > 2 and lowered[-3] not in vowels:
        num_vowels -= 1
    if lowered.endswith("ed") and len(lowered) > 3 and lowered[-3] not in vowels:
        num_vowels -= 1

    return num_vowels if num_vowels > 0 else 1

# Total syllables per document: add up the estimate for every word in 'analysis'
df['total_syllables'] = [
    sum(map(count_syllables, text.split())) for text in df['analysis']
]


print(df)



In [None]:
import re


pronouns = ["I", "we", "my", "ours", "us"]
pattern = r"(?<!\w)({})\b".format("|".join(pronouns))


def count_personal_pronouns(text):
    """Count whole-word, case-insensitive occurrences of ``pronouns`` in ``text``.

    The all-caps spelling "US" is skipped because it normally denotes the
    country rather than the pronoun.
    """
    matches = re.findall(pattern, text, re.IGNORECASE)
    return sum(1 for match in matches if match != "US")


# Count on the original title + content rather than on 'analysis': the cleaned
# text has had stop words removed (NLTK's English list includes "i", "we",
# "my" and "ours") and has been lowercased, so most pronouns are gone there and
# "US" can no longer be told apart from "us".
original_text = df['title'] + "\n\n" + df['content']
df['personal_pronouns'] = original_text.apply(count_personal_pronouns)

# Print or save the updated DataFrame
print(df)

In [None]:
# Average word length = characters contained in the words / number of words
character_totals = df['analysis'].apply(lambda text: sum(map(len, text.split())))
df['average_word_length'] = character_totals / df['total_words']


print(df)



In [None]:
# List the columns accumulated so far.
# NOTE(review): the listing recorded with this cell ends in 'total_chars', which
# no visible cell creates — presumably left over from a removed cell; confirm.
list(df.columns)


['url_id',
 'url',
 'title',
 'content',
 'analysis',
 'positive_score',
 'negative_score',
 'total_words',
 'polarity_score',
 'subjectivity_score',
 'average_sentence_length',
 'average_words_per_sentence',
 'complex_words',
 'complex_words_percentage',
 'fog_index',
 'total_syllables',
 'personal_pronouns',
 'average_word_length',
 'total_chars']

In [None]:
# Map the working column names to the headers required in the output sheet.
# 'total_syllables' is deliberately not mapped: it is a per-document total,
# while the header "SYLLABLE PER WORD" needs a per-word figure (computed below).
new_column_names = {
    "url_id": "URL_ID",
    "url": "URL",
    "positive_score": "POSITIVE SCORE",
    "negative_score": "NEGATIVE SCORE",
    "polarity_score": "POLARITY SCORE",
    "subjectivity_score": "SUBJECTIVITY SCORE",
    "average_sentence_length": "AVG SENTENCE LENGTH",
    "complex_words_percentage": "PERCENTAGE OF COMPLEX WORDS",
    "fog_index": "FOG INDEX",
    "average_words_per_sentence": "AVG NUMBER OF WORDS PER SENTENCE",
    "complex_words": "COMPLEX WORD COUNT",
    "total_words": "WORD COUNT",
    "personal_pronouns": "PERSONAL PRONOUNS",
    "average_word_length": "AVG WORD LENGTH",

}

# Syllables per word = total syllables / word count (0 for empty documents).
# The guard makes the cell safe to re-run: once 'total_words' has been renamed,
# the column already exists and is left alone.
if {"total_syllables", "total_words"} <= set(df.columns):
    nonzero_word_counts = df["total_words"].where(df["total_words"] != 0)
    df["SYLLABLE PER WORD"] = (df["total_syllables"] / nonzero_word_counts).fillna(0.0)

df.rename(columns=new_column_names, inplace=True)

In [None]:
# Inspect the frame after the columns were renamed to their output headers.
print(df)

In [None]:

# Output columns, in the order they must appear in the spreadsheet.
column_order = [
    "URL_ID",
    "URL",
    "POSITIVE SCORE",
    "NEGATIVE SCORE",
    "POLARITY SCORE",
    "SUBJECTIVITY SCORE",
    "AVG SENTENCE LENGTH",
    "PERCENTAGE OF COMPLEX WORDS",
    "FOG INDEX",
    "AVG NUMBER OF WORDS PER SENTENCE",
    "COMPLEX WORD COUNT",
    "WORD COUNT",
    "SYLLABLE PER WORD",
    "PERSONAL PRONOUNS",
    "AVG WORD LENGTH",
]

# Selecting by this list already leaves out the working columns (title,
# content, analysis, ...), because none of them is named in it.
dataframe_to_export = df[column_order]
dataframe_to_export.to_excel("output.xlsx", index=False)

