In [None]:
# Bring the Kaggle CSV into the Colab runtime.
# The upload prompt is only shown when the file is not already on disk, so a
# "Restart & Run All" does not stall waiting for a manual upload.
import os

from google.colab import files

if os.path.exists('fake_job_postings.csv'):
    # Nothing new was uploaded in this run; keep `uploaded` defined as an
    # empty mapping so later code that inspects it still works.
    uploaded = {}
else:
    uploaded = files.upload()

Saving fake_job_postings.csv to fake_job_postings.csv


In [None]:
# Day 2: Understanding and Loading the Dataset #125

import pandas as pd

# Load dataset (after downloading from Kaggle)
df = pd.read_csv('fake_job_postings.csv')

# Display first few rows
print("Sample Data:")
print(df.head())

# Display basic info.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
print("\nDataset Info:")
df.info()

# Check for missing values
print("\nMissing Values per Column:")
print(df.isnull().sum())

# Check distribution of target variable
print("\nTarget (fraudulent) Distribution:")
print(df['fraudulent'].value_counts())

# Basic statistics (include='all' also summarises the non-numeric columns)
print("\nDataset Summary:")
print(df.describe(include='all'))

Sample Data:
   job_id                                      title            location  \
0       1                           Marketing Intern    US, NY, New York   
1       2  Customer Service - Cloud Video Production      NZ, , Auckland   
2       3    Commissioning Machinery Assistant (CMA)       US, IA, Wever   
3       4          Account Executive - Washington DC  US, DC, Washington   
4       5                        Bill Review Manager  US, FL, Fort Worth   

  department salary_range                                    company_profile  \
0  Marketing          NaN  We're Food52, and we've created a groundbreaki...   
1    Success          NaN  90 Seconds, the worlds Cloud Video Production ...   
2        NaN          NaN  Valor Services provides Workforce Solutions th...   
3      Sales          NaN  Our passion for improving quality of life thro...   
4        NaN          NaN  SpotSource Solutions LLC is a Global Human Cap...   

                                         descript

In [None]:
# Day 3: Text Cleaning and Preprocessing

import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK corpora used below: the stopword list and WordNet
# (the latter backs WordNetLemmatizer).
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# Re-read the raw postings from the same CSV used on Day 2
df = pd.read_csv('fake_job_postings.csv')

# Define text cleaning function
# Cache for the NLTK resources used by clean_text; filled on first use so the
# stopword corpus is not re-read for every row passed through .apply().
_CLEAN_TEXT_RESOURCES = {}


def _clean_text_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use."""
    if not _CLEAN_TEXT_RESOURCES:
        _CLEAN_TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _CLEAN_TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _CLEAN_TEXT_RESOURCES['stop_words'], _CLEAN_TEXT_RESOURCES['lemmatizer']


def clean_text(text):
    """Normalise one free-text value into a space-separated string of tokens.

    Steps, in order: lowercase, replace HTML tags with a space, drop URLs
    (tokens starting with ``http`` or ``www``), replace punctuation and digits
    with spaces, collapse whitespace, remove English stopwords and lemmatize
    each remaining word.

    Args:
        text: The raw value. Missing values (``None`` / ``NaN``) are accepted;
            other non-string values are converted with ``str()``.

    Returns:
        The cleaned text, or ``""`` for a missing value.
    """
    if pd.isnull(text):
        return ""

    # Non-null, non-string cells (e.g. numbers) would otherwise fail on .lower()
    if not isinstance(text, str):
        text = str(text)

    # 1. Lowercase
    text = text.lower()

    # 2. Remove HTML tags
    text = re.sub(r'<.*?>', ' ', text)

    # 3. Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)

    # 4. Remove punctuation and numbers
    text = re.sub(r'[%s\d]' % re.escape(string.punctuation), ' ', text)

    # 5. Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    # 6. Remove stopwords + Lemmatization (resources are built once, then reused)
    stop_words, lemmatizer = _clean_text_resources()

    words = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]

    return " ".join(words)

# Clean the description column only; the result is stored as a new column
raw_descriptions = df['description']
df['clean_description'] = raw_descriptions.apply(clean_text)

# Before/after comparison for a single posting, truncated for readability
sample_row = 1
preview_chars = 300
original_snippet = df['description'].iloc[sample_row][:preview_chars]
cleaned_snippet = df['clean_description'].iloc[sample_row][:preview_chars]
print("Original Text:\n", original_snippet)
print("\nCleaned Text:\n", cleaned_snippet)

# Tabular preview of the first three rows, raw next to cleaned
preview_columns = ['description', 'clean_description']
print("\nExample of Cleaned Data:")
print(df[preview_columns].head(3))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Original Text:
 Organised - Focused - Vibrant - Awesome!Do you have a passion for customer service? Slick typing skills? Maybe Account Management? ...And think administration is cooler than a polar bear on a jetski? Then we need to hear you! We are the Cloud Video Production Service and opperating on a glodal level

Cleaned Text:
 organised focused vibrant awesome passion customer service slick typing skill maybe account management think administration cooler polar bear jetski need hear cloud video production service opperating glodal level yeah pretty cool serious delivering world class product excellent customer service rap

Example of Cleaned Data:
                                         description  \
0  Food52, a fast-growing, James Beard Award-winn...   
1  Organised - Focused - Vibrant - Awesome!Do you...   
2  Our client, located in Houston, is actively se...   

                                   clean_description  
0  food fast growing james beard award winning on...  
1  or

In [None]:
#Day 3, Task 1
##task 1
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords

# Read the raw postings from the CSV
df = pd.read_csv("fake_job_postings.csv")

# Make sure the NLTK stopword corpus is available before any cleaning runs
nltk.download('stopwords')

# -------------------------------
# ✅ Task 1: Cleaning Function
# -------------------------------
# Cache for the English stop-word set used by clean_company_profile; filled on
# first use so the corpus is not re-read for every row passed through .apply().
_COMPANY_PROFILE_CACHE = {}


def _company_profile_stop_words():
    """Return the English stop-word set, loading it from NLTK on first use."""
    if 'stop_words' not in _COMPANY_PROFILE_CACHE:
        _COMPANY_PROFILE_CACHE['stop_words'] = frozenset(stopwords.words("english"))
    return _COMPANY_PROFILE_CACHE['stop_words']


def clean_company_profile(text):
    """Normalise one free-text value into a space-separated string of tokens.

    Steps, in order: lowercase, replace HTML tags with a space, replace digit
    runs with a space, replace punctuation with spaces, collapse whitespace,
    and remove English stopwords. No lemmatization is applied.

    Args:
        text: The raw value. Missing values (``None`` / ``NaN``) are accepted;
            other non-string values are converted with ``str()``.

    Returns:
        The cleaned text, or ``""`` for a missing value.
    """
    if pd.isnull(text):
        return ""

    # Non-null, non-string cells (e.g. numbers) would otherwise fail on .lower()
    if not isinstance(text, str):
        text = str(text)

    # 1. Lowercase
    text = text.lower()

    # 2. Remove HTML tags
    text = re.sub(r'<.*?>', ' ', text)

    # 3. Remove numbers
    text = re.sub(r'\d+', ' ', text)

    # 4. Remove punctuation
    text = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', text)

    # 5. Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    # 6. Remove stopwords (the set is built once, then reused across calls)
    stop_words = _company_profile_stop_words()
    words = [word for word in text.split() if word not in stop_words]

    return " ".join(words)

# Run the cleaner over every company profile and keep the result as a new
# column next to the original text
cleaned_profiles = df["company_profile"].apply(clean_company_profile)
df["clean_company_profile"] = cleaned_profiles

# Raw vs. cleaned, first five rows
comparison_columns = ["company_profile", "clean_company_profile"]
print(df[comparison_columns].head())


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                     company_profile  \
0  We're Food52, and we've created a groundbreaki...   
1  90 Seconds, the worlds Cloud Video Production ...   
2  Valor Services provides Workforce Solutions th...   
3  Our passion for improving quality of life thro...   
4  SpotSource Solutions LLC is a Global Human Cap...   

                               clean_company_profile  
0  food created groundbreaking award winning cook...  
1  seconds worlds cloud video production service ...  
2  valor services provides workforce solutions me...  
3  passion improving quality life geography heart...  
4  spotsource solutions llc global human capital ...  


In [None]:
# ✅Day 3 Task 2: Word Count Analysis
# Compares the average number of whitespace-separated words per description
# before and after cleaning.

# Word count before cleaning.
# Missing descriptions are replaced with "" first: a bare astype(str) turns
# NaN into the literal string "nan" and counts it as one word, while the
# cleaned column yields "" (zero words) for the same row, which would skew
# the before/after comparison.
df["word_count_before"] = (
    df["description"].fillna("").astype(str).str.split().str.len()
)

# Clean description column using same function
df["clean_description"] = df["description"].apply(clean_company_profile)

# Word count after cleaning
df["word_count_after"] = df["clean_description"].str.split().str.len()

# Calculate averages
avg_before = df["word_count_before"].mean()
avg_after = df["word_count_after"].mean()

print("Average words BEFORE cleaning:", avg_before)
print("Average words AFTER cleaning:", avg_after)

Average words BEFORE cleaning: 170.44602908277406
Average words AFTER cleaning: 116.08568232662192
