<a href="https://colab.research.google.com/github/Valmik2004/Infosys-Springboard-Internship/blob/main/Day_3_Tasks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import files
uploaded = files.upload()


Saving fake_job_postings.csv to fake_job_postings.csv


In [3]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK data (only once)
nltk.download('stopwords')
nltk.download('wordnet')

# Load preprocessed dataset (from Day 3)
df = pd.read_csv("fake_job_postings.csv")

# -------------------------------------
# Check if 'clean_description' exists; if not, create it
# -------------------------------------
if 'clean_description' not in df.columns:
    print("'clean_description' not found. Creating now...")

    # Fill missing values in description column
    df['description'] = df['description'].fillna('')

    # Initialize tools
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # Text cleaning function
    def clean_text(text):
        text = text.lower()
        text = re.sub(r'<.*?>', '', text)           # Remove HTML tags
        text = re.sub(r'[^a-z\s]', '', text)        # Remove punctuation & numbers
        words = [lemmatizer.lemmatize(w) for w in text.split() if w not in stop_words]
        return " ".join(words)

    # Apply cleaning
    df['clean_description'] = df['description'].apply(clean_text)
    print("Cleaned text column 'clean_description' created successfully.\n")
else:
    print("'clean_description' column found.\n")

# -------------------------------------
# Feature Extraction
# -------------------------------------

# Use the cleaned text column
texts = df['clean_description'].fillna('').tolist()

# 1. Bag-of-Words
bow_vectorizer = CountVectorizer(max_features=2000)  # limit to top 2000 words
X_bow = bow_vectorizer.fit_transform(texts)

print("BoW shape:", X_bow.shape)
print("Sample feature names (BoW):", bow_vectorizer.get_feature_names_out()[:10])

# 2. TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=2000)
X_tfidf = tfidf_vectorizer.fit_transform(texts)

print("\nTF-IDF shape:", X_tfidf.shape)
print("Sample feature names (TF-IDF):", tfidf_vectorizer.get_feature_names_out()[:10])

# 3. Compare sparsity and values
print("\nExample BoW vector (first row):")
print(X_bow[0].toarray())

print("\nExample TF-IDF vector (first row):")
print(X_tfidf[0].toarray())

print("\nColumns in DataFrame:")
print(df.columns)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


'clean_description' not found. Creating now...
Cleaned text column 'clean_description' created successfully.

BoW shape: (17880, 2000)
Sample feature names (BoW): ['ab' 'ability' 'able' 'abroad' 'academic' 'accept' 'access'
 'accommodation' 'accordance' 'according']

TF-IDF shape: (17880, 2000)
Sample feature names (TF-IDF): ['ab' 'ability' 'able' 'abroad' 'academic' 'accept' 'access'
 'accommodation' 'accordance' 'according']

Example BoW vector (first row):
[[0 0 0 ... 0 0 0]]

Example TF-IDF vector (first row):
[[0. 0. 0. ... 0. 0. 0.]]

Columns in DataFrame:
Index(['job_id', 'title', 'location', 'department', 'salary_range',
       'company_profile', 'description', 'requirements', 'benefits',
       'telecommuting', 'has_company_logo', 'has_questions', 'employment_type',
       'required_experience', 'required_education', 'industry', 'function',
       'fraudulent', 'clean_description'],
      dtype='object')


In [4]:
# Task 1:
# ‚óè Create both BoW and TF-IDF features from the company_profile column.
# ‚óè Compare their shapes and discuss which one captures meaning better.

#-------------------------------------------------------------------------------

# Task 1: Create BoW and TF-IDF features from company_profile

# Bag of Words representation
bow_vectorizer = CountVectorizer(max_features=2000)
X_bow = bow_vectorizer.fit_transform(texts)

# TF-IDF representation
tfidf_vectorizer = TfidfVectorizer(max_features=2000)
X_tfidf = tfidf_vectorizer.fit_transform(texts)

# Compare shapes
print(f"BoW shape   : {X_bow.shape}")
print(f"TF-IDF shape: {X_tfidf.shape}")

# Sample features
print("\nüîπ Sample feature names (BoW):", bow_vectorizer.get_feature_names_out()[:10])
print("üîπ Sample feature names (TF-IDF):", tfidf_vectorizer.get_feature_names_out()[:10])

# Discussion output
print("\n Discussion:")
print("Both BoW and TF-IDF have the same shape because both are limited to 2000 features.")
print("However, BoW only counts word frequency, while TF-IDF gives more importance to rare yet informative words,")
print("making TF-IDF generally better at capturing contextual meaning.")



BoW shape   : (17880, 2000)
TF-IDF shape: (17880, 2000)

üîπ Sample feature names (BoW): ['ab' 'ability' 'able' 'abroad' 'academic' 'accept' 'access'
 'accommodation' 'accordance' 'according']
üîπ Sample feature names (TF-IDF): ['ab' 'ability' 'able' 'abroad' 'academic' 'accept' 'access'
 'accommodation' 'accordance' 'according']

 Discussion:
Both BoW and TF-IDF have the same shape because both are limited to 2000 features.
However, BoW only counts word frequency, while TF-IDF gives more importance to rare yet informative words,
making TF-IDF generally better at capturing contextual meaning.


In [5]:
# Task 2:
# ‚óè Print top 20 most frequent words in job descriptions (using BoW).
#-------------------------------------------------------------------------------

import numpy as np
import pandas as pd

# Task 2: Top 20 most frequent words in company_profile (BoW)

# Convert BoW matrix to array
bow_array = X_bow.toarray()

# Sum all word counts across rows (to get total frequency of each term)
word_counts = np.sum(bow_array, axis=0)

# Create a DataFrame for easy sorting
bow_freq_df = pd.DataFrame({
    'word': bow_vectorizer.get_feature_names_out(),
    'frequency': word_counts
}).sort_values(by='frequency', ascending=False)

# Print top 20 most frequent words
print(bow_freq_df.head(20))



             word  frequency
1789         team      20009
437      customer      14834
1974         work      14830
1632      service      12377
334       company      11524
1387      product      11412
229      business      11407
293        client      10972
673    experience      10263
1187          new       9695
1581         sale       9323
1061      looking       8717
1405      project       8171
501   development       8066
1084   management       7245
1227  opportunity       6584
1099    marketing       6536
1768      support       6533
484        design       6357
974           job       6343
