<a href="https://colab.research.google.com/github/VaishnaviBairagoni/NLP-LAB/blob/main/NLP-F-22-08-2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
#Task 1:Dataset Loading and cleaning
import pandas as pd
import re
import spacy

# Read the dataset CSV. The path and the text column name are declared once
# here so the rest of the cell does not repeat them.
DATASET_PATH = '/ai_human_content_detection_dataset.csv'
TEXT_COLUMN = 'text_content'
raw_df = pd.read_csv(DATASET_PATH)

# Inspect the structure of the frame exactly as it was read from disk.
print("Dataset shape:", raw_df.shape)
print("\nColumns:", raw_df.columns)

# Preview the first 5 values of the text column.
text_preview = raw_df[TEXT_COLUMN].head()
print("\nFirst 5 text_content entries:\n", text_preview)

# Drop rows whose text is missing. The result is derived from `raw_df` rather
# than overwriting it, so re-running this cell always starts from the same
# input; `df` is the cleaned frame.
df = raw_df.dropna(subset=[TEXT_COLUMN])
remaining_nulls = df[TEXT_COLUMN].isna().sum()
print("\nNull values after cleaning:", remaining_nulls)


Dataset shape: (1367, 17)

Columns: Index(['text_content', 'content_type', 'word_count', 'character_count',
       'sentence_count', 'lexical_diversity', 'avg_sentence_length',
       'avg_word_length', 'punctuation_ratio', 'flesch_reading_ease',
       'gunning_fog_index', 'grammar_errors', 'passive_voice_ratio',
       'predictability_score', 'burstiness', 'sentiment_score', 'label'],
      dtype='object')

First 5 text_content entries:
 0    Score each cause. Quality throughout beautiful...
1    Board its rock. Job worker break tonight coupl...
2    Way debate decision produce. Dream necessary c...
3    Story turn because such during open model. Tha...
4    Place specific as simply leader fall analysis....
Name: text_content, dtype: object

Null values after cleaning: 0


In [9]:
#Task-2:POS tagging with spaCy
!python -m spacy download en_core_web_sm


# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Take first 5 entries from 'description_x'
sentences = df['description_x'].head(5).tolist()

for idx, sentence in enumerate(sentences, 1):
    print(f"\nSentence {idx}: {sentence}")
    doc = nlp(sentence)

    nouns = [token.text for token in doc if token.pos_ == "NOUN"]
    verbs = [token.text for token in doc if token.pos_ == "VERB"]
    adjectives = [token.text for token in doc if token.pos_ == "ADJ"]

    print("Nouns:", nouns)
    print("Verbs:", verbs)
    print("Adjectives:", adjectives)

# Q2: Regex Cleaning - Remove phone, email, URLs, special chars
texts = [
    "My phone number is 1234567890 and my email is test@domain.com",
    "Visit https://example.com for more info!!!",
    "HELLO!!! This is SOOOOO exciting :))",
    "Contact us at info@company.org or call +91 98765-43210",
    "Python's regex is very useful!!!  #Coding #Fun"
]


def clean_text(text):
    """Strip phone numbers, emails, URLs and non-letter characters from ``text``.

    The substitutions run in a fixed order: phone numbers, emails and URLs are
    removed first, while their digits and punctuation are still present to
    match on; only then is every remaining character that is not an ASCII
    letter or whitespace dropped.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string, with runs of whitespace collapsed to a single
        space and leading/trailing whitespace removed.
    """
    # Phone numbers: optional '+', then a run of at least 10 characters made of
    # digits, spaces or hyphens that starts and ends with a digit.
    text = re.sub(r'\+?\d[\d -]{8,}\d', '', text)

    # Emails: any non-space run containing '@'.
    text = re.sub(r'\S+@\S+', '', text)

    # URLs: the dot after 'www' is escaped so that only a literal "www." prefix
    # matches; unescaped, it also deleted ordinary words such as "wwwhat".
    text = re.sub(r'http\S+|www\.\S+', '', text)

    # Remove special characters (except spaces and letters)
    text = re.sub(r'[^A-Za-z\s]', '', text)

    # Remove extra spaces
    return re.sub(r'\s+', ' ', text).strip()


cleaned_texts = [clean_text(text) for text in texts]

print("\nCleaned Texts:")
for ct in cleaned_texts:
    print(ct)

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m36.1 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.

Sentence 1: semtech corp
Nouns: []
Verbs: []
Adjectives: []

Sentence 2: vanguard mid cap index
Nouns: ['index']
Verbs: []
Adjectives: []

Sentence 3: spdr gold trust gold shares
Nouns: ['spdr', 'gold', 'trust', 'gold', 'shares']
Verbs: []
Adjectives: []

Sentence 4: vanguard total bond index adm
Nouns: ['index']
Verbs: []
Adjective