### 📘 1. Data Description

In [None]:

import pandas as pd

# Read the CSV, drop the 'Unnamed: 0' column when the file has one, and
# encode the labels as integers (FAKE -> 0, REAL -> 1). Any label outside
# that mapping becomes NaN.
df = (
    pd.read_csv("/content/news.csv")
    .drop(columns=['Unnamed: 0'], errors='ignore')
    .assign(label=lambda frame: frame['label'].map({'FAKE': 0, 'REAL': 1}))
)

# Structural overview: size, column dtypes, null counts, class balance,
# and the raw text of the first article.
print("Dataset Shape:", df.shape)
print("\nData Types:\n", df.dtypes)
print("\nMissing Values:\n", df.isna().sum())
print("\nClass Distribution:\n", df['label'].value_counts())
print("\nSample Text:\n", df['text'].iloc[0])


### 🧹 2. Data Preprocessing

In [None]:

import re

def clean_text(text):
    """Normalise a raw article string for text mining.

    Every run of non-word characters (anything other than letters, digits
    and underscore, which includes all whitespace) is collapsed into a
    single space; the result is lower-cased and stripped of leading and
    trailing spaces.

    Parameters
    ----------
    text : str, None or float NaN
        Raw text. Missing values (``None`` or a float ``NaN``) are treated
        as empty text instead of raising, so a column with empty cells can
        be cleaned in a single pass.

    Returns
    -------
    str
        The cleaned text; ``""`` for missing input.

    Raises
    ------
    TypeError
        If ``text`` is a non-missing value that is not a string.
    """
    # `text != text` is only true for NaN; this avoids depending on pandas here.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    # A single `\W+` pass is equivalent to replacing each non-word character
    # with a space and then collapsing repeated whitespace.
    return re.sub(r'\W+', ' ', text).lower().strip()

# Element-wise cleaning of the raw article text into a new column
df['clean_text'] = df['text'].map(clean_text)


### 📊 3. Exploratory Data Analysis (EDA)

In [None]:

import matplotlib.pyplot as plt
import seaborn as sns

# Number of whitespace-separated tokens in each cleaned article
df['text_length'] = df['clean_text'].map(lambda cleaned: len(cleaned.split()))

# One figure with two side-by-side panels
fig, (label_ax, length_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: number of articles per class, with the 0/1 codes relabelled
sns.countplot(x='label', data=df, ax=label_ax)
label_ax.set_xticks([0, 1])
label_ax.set_xticklabels(['FAKE', 'REAL'])
label_ax.set_title('Label Distribution')

# Right panel: histogram of article lengths with a KDE overlay
sns.histplot(df['text_length'], bins=50, kde=True, ax=length_ax)
length_ax.set_title('Text Length Distribution')
length_ax.set_xlabel('Number of Words')

fig.tight_layout()
plt.show()


### 🧵 4. Parsing the Text

In [None]:
# Five cleaned articles, drawn with a fixed seed so the same rows appear on every run
print(df['clean_text'].sample(n=5, random_state=1))

### 🔍 5. Text Mining

In [None]:
# First rows of the cleaned text next to its word count
print(df.loc[:, ['clean_text', 'text_length']].head())

### 🔎 6. str_subset() Equivalent

In [None]:

# Keep only the rows whose cleaned text contains "breaking"
# (substring match; rows with missing text count as non-matching).
subset_df = df.loc[df['clean_text'].str.contains("breaking", na=False)]
print(subset_df.loc[:, ['clean_text']].head())


### ✅ 7. str_detect() Equivalent

In [None]:

# 1 when the cleaned article contains "trump", else 0 (boolean mask cast to int).
# NOTE: this is a substring match, so words such as "trumpet" are flagged too.
df['contains_trump'] = df['clean_text'].str.contains("trump", na=False).astype(int)

# Fixed seed so the preview shows the same rows on every Restart & Run All
print(df[['clean_text', 'contains_trump']].sample(5, random_state=1))


### 🧵 8. str_extract() Equivalent

In [None]:

# Leading run of word characters of each cleaned article; NaN when the
# text does not start with a word character (e.g. an empty string).
df['first_word'] = df['clean_text'].str.extract(r'^(\w+)', expand=False)

# Fixed seed so the preview shows the same rows on every Restart & Run All
print(df[['clean_text', 'first_word']].sample(5, random_state=1))


### ✍️ 9. Regular Expressions

In [None]:

# Raw (uncleaned) text of the row at position 10, used as the working
# example by the regex cells that follow.
sample_text = df['text'].iat[10]
print("Sample Text:\n", sample_text)


In [None]:
# Whole words made of one uppercase ASCII letter followed by one or more
# lowercase ASCII letters.
re.compile(r"\b[A-Z][a-z]+\b").findall(sample_text)

In [None]:
# Every case-sensitive occurrence of "Breaking" or "Exclusive"; with a
# single capture group, findall returns the captured alternative.
re.compile(r"(Breaking|Exclusive)").findall(sample_text)

In [None]:
# Runs of two or more consecutive vowels, searched in the lower-cased text
re.compile(r"[aeiou]{2,}").findall(sample_text.lower())

In [None]:
# "Breaking" only at the very start of the text: no MULTILINE flag is set,
# so ^ does not match after embedded newlines.
re.compile(r"^Breaking").findall(sample_text)

In [None]:
# Immediately repeated words (e.g. "the the") in the lower-cased text;
# findall returns the captured word for each match.
re.compile(r"\b(\w+)\s+\1\b").findall(sample_text.lower())

### 🔢 10. Logical Vector to Numerical Vector

In [None]:

# Boolean "contains 'urgent'" mask converted to a 0/1 integer column.
# NOTE: this is a substring match, so words such as "insurgent" are flagged too.
df['urgent_flag'] = df['clean_text'].str.contains("urgent", na=False).astype(int)

# Fixed seed so the preview shows the same rows on every Restart & Run All
print(df[['clean_text', 'urgent_flag']].sample(5, random_state=1))


### 😊 11. Sentiment Analysis

In [None]:

from textblob import TextBlob
# TextBlob polarity of each cleaned article, stored as the sentiment score
df['sentiment_score'] = df['clean_text'].apply(lambda cleaned: TextBlob(cleaned).sentiment.polarity)

# Fixed seed so the preview shows the same rows on every Restart & Run All
print(df[['clean_text', 'sentiment_score']].sample(5, random_state=1))


### 🔡 12. Bi-gram and N-gram Extraction

In [None]:

from sklearn.feature_extraction.text import CountVectorizer

# Count bi-grams and tri-grams, keeping only the 20 most frequent across the corpus
ngram_vectorizer = CountVectorizer(
    max_features=20,
    ngram_range=(2, 3),
)

# Learn the vocabulary and build the document-term count matrix in one call
X_ngrams = ngram_vectorizer.fit_transform(df['clean_text'])

print("Top N-grams:\n", ngram_vectorizer.get_feature_names_out())
