In [36]:
#loading and reading data
import pandas as pd

df = pd.read_csv('/content/output.csv')

print(df.info())
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Title       1 non-null      object
 1   Paragraphs  1 non-null      object
 2   Headings    1 non-null      object
 3   Links       1 non-null      object
 4   Images      1 non-null      object
dtypes: object(5)
memory usage: 168.0+ bytes
None
                                               Title  \
0  Iran-Israel war: How it may impact investors -...   

                                          Paragraphs  \
0  Stock Trading,Macroeconomics Made Easy: Online...   

                                            Headings  \
0  Iran-Israel war: How it may impact investors,T...   

                                               Links  \
0  https://economictimes.indiatimes.com/markets/i...   

                                              Images  
0  https://img.etimg.com/photo/msid-76920425,qual...  


In [37]:
#Handling Missing values and Standardising Features-----------------

print("Missing values before handling:\n", df.isnull().sum())

# Filling missing values of Title, Paragraphs and Headings with empty strings "".
df['Title'].fillna("", inplace=True)
df['Paragraphs'].fillna("", inplace=True)
df['Headings'].fillna("", inplace=True)

# Missing URLs and images are filled with "No Link" and "No Image".
df['Links'].fillna("No Link", inplace=True)
df['Images'].fillna("No Image", inplace=True)

print("Missing values after handling:\n", df.isnull().sum())


Missing values before handling:
 Title         0
Paragraphs    0
Headings      0
Links         0
Images        0
dtype: int64
Missing values after handling:
 Title         0
Paragraphs    0
Headings      0
Links         0
Images        0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Title'].fillna("", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Paragraphs'].fillna("", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves

In [38]:
import re

# Converting all texts in to lowercase for text consistency
df['Title'] = df['Title'].str.lower()
df['Paragraphs'] = df['Paragraphs'].str.lower()
df['Headings'] = df['Headings'].str.lower()

# Removing any punctuations or unwanted characters from Paragraphs and Headings.
df['Paragraphs'] = df['Paragraphs'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
df['Headings'] = df['Headings'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

# Cleaning URLs (example: removing unwanted query strings)
def clean_url(url):
    return re.sub(r'\?.*', '', url) if isinstance(url, str) else url

df['Links'] = df['Links'].apply(clean_url)
df['Images'] = df['Images'].apply(clean_url)


print(df.head())



                                               Title  \
0  iran-israel war: how it may impact investors -...   

                                          Paragraphs  \
0  stock tradingmacroeconomics made easy online c...   

                                            Headings  \
0  iranisrael war how it may impact investorsthe ...   

                                               Links  \
0  https://economictimes.indiatimes.com/markets/i...   

                                              Images  
0  https://img.etimg.com/photo/msid-76920425,qual...  


In [39]:
# Checking  word count for each paragraph
df['Paragraph_Length'] = df['Paragraphs'].apply(lambda x: len(x.split()))  # Add a column named Paragraph_Lengths and it will store word count for each paragraph

# Displaying paragraph lengths to understand the distribution
print(df[['Paragraphs', 'Paragraph_Length']].head(20))  # Print first 20 rows



                                          Paragraphs  Paragraph_Length
0  stock tradingmacroeconomics made easy online c...               535


In [40]:
#Removing Outliers------------
# Removing rows based on the paragraph length, like remove paragraph with length  is too short or too long
df = df[df['Paragraphs'].apply(lambda x: 50 < len(x.split()) < 3000)]  # Adjusted thresholds

#Checking and displaying the remaining data
print("Remaining rows after removing outliers:", len(df))
print(df.head())



Remaining rows after removing outliers: 1
                                               Title  \
0  iran-israel war: how it may impact investors -...   

                                          Paragraphs  \
0  stock tradingmacroeconomics made easy online c...   

                                            Headings  \
0  iranisrael war how it may impact investorsthe ...   

                                               Links  \
0  https://economictimes.indiatimes.com/markets/i...   

                                              Images  Paragraph_Length  
0  https://img.etimg.com/photo/msid-76920425,qual...               535  


In [54]:
# lemmatization----------
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import spacy

nlp = spacy.load("en_core_web_sm") # Loading spaCy's English model

# Apply lemmatization to the 'Paragraphs' column
df['Paragraphs_lemmatized'] = df['Paragraphs'].apply(lambda x: ' '.join([token.lemma_ for token in nlp(x)]))


print(df[['Paragraphs', 'Paragraphs_lemmatized']].head())



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                                          Paragraphs  \
0  stock tradingmacroeconomics made easy online c...   

                               Paragraphs_lemmatized  
0  stock tradingmacroeconomic make easy online ce...  


In [55]:
#Tokenization and Stop word removal--------------

nltk.download('stopwords')
from nltk.corpus import stopwords

# Defining the list of stopwords
stop_words = set(stopwords.words('english'))

# Removing stopwords from the 'Paragraphs' column
df['Paragraphs_clean'] = df['Paragraphs_lemmatized'].apply(lambda x: ' '.join([word for word in word_tokenize(x) if word not in stop_words]))

print(df[['Paragraphs_lemmatized', 'Paragraphs_clean']].head())



                               Paragraphs_lemmatized  \
0  stock tradingmacroeconomic make easy online ce...   

                                    Paragraphs_clean  
0  stock tradingmacroeconomic make easy online ce...  


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [50]:
#Task 3----------

from collections import Counter

text_combined = ' '.join(df['Paragraphs_clean'])

# Split into words and count frequency
word_counts = Counter(text_combined.split()) # Returns a dictionary like object where keys are words and values are their counts

# Getting top 10 the most common words
most_common_words = word_counts.most_common(10)
print("Most common words:", most_common_words)


Most common words: [('make', 14), ('courseby', 12), ('easy', 11), ('trading', 10), ('stock', 9), ('prime', 8), ('financial', 7), ('1', 7), ('saraf', 6), ('et', 6)]


In [45]:
from transformers import pipeline

# Load the summarization model(pre-trained model from HuggingFace from summarization tasks).
summarizer = pipeline("summarization")

# Function to summarize text
def summarize_text(text):
    truncated_text = ' '.join(text.split()[:500])
    summary = summarizer(truncated_text, max_length=130, min_length=30, do_sample=False) #model try to keep summary shorter than 130 and longer than 30 of length
    return summary[0]['summary_text']

# Applying summarization function to each paragraph
df['Summary'] = df['Paragraphs_clean'].apply(summarize_text)


print(df[['Paragraphs_clean', 'Summary']].head())


No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


                                    Paragraphs_clean  \
0  stock tradingmacroeconomic make easy online ce...   

                                             Summary  
0   Stock tradingmacroeconomic make easy online c...  


In [46]:
#Sentiment Analysis------
from transformers import pipeline

# Loading emotion analysis model from Hugging Face
emotion_analyzer = pipeline("text-classification", model="mrm8488/t5-base-finetuned-emotion")

# Defining label mapping
label_mapping = {
    "LABEL_0": "joy",
    "LABEL_1": "sadness",
    "LABEL_2": "anger",
    "LABEL_3": "fear",
    "LABEL_4": "surprise",
    "LABEL_5": "neutral"
}

# Analyze the emotion for the text
def analyze_emotion(text):

    truncated_text = ' '.join(text.split()[:500])
    results = emotion_analyzer(truncated_text)

    # Mapping labels to human-readable emotions
    mapped_results = [(label_mapping[result['label']], result['score']) for result in results]
    return mapped_results

# Apply emotion analysis to each paragraph
df['Emotion'] = df['Paragraphs_clean'].apply(analyze_emotion)


df[['Paragraphs', 'Emotion']].head()


Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at mrm8488/t5-base-finetuned-emotion and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Token indices sequence length is longer than the specified maximum sequence length for this model (786 > 512). Running this sequence through the model will result in indexing errors


Unnamed: 0,Paragraphs,Emotion
0,stock tradingmacroeconomics made easy online c...,"[(sadness, 0.8176189661026001)]"
