In [1]:
import re
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import pandas as pd

In [3]:
# Download NLTK data if not already done
nltk.download('stopwords')
nltk.download('wordnet')

# Load spaCy model for NER and lemmatization
nlp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pradeesh11/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/pradeesh11/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
# Initialize stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [9]:
# Load stopwords from NLTK
stop_words = set(stopwords.words('english'))
custom_stop_words = {'received', 'stating', 'fraudster', 'message', 'acting', 'police', 'job'}
stop_words.update(custom_stop_words)

In [10]:
def preprocess_text(text):
    # Check if the input is not a string (e.g., NaN or float), replace it with an empty string
    if not isinstance(text, str):
        text = ''
    
    # Step 1: Lowercase the text
    text = text.lower()
    
    # Step 2: Remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Step 3: Tokenize the text
    words = text.split()
    
    # Step 4: Remove stopwords
    words = [word for word in words if word not in stop_words]
    
    # Step 5: Apply stemming or lemmatization
    words = [stemmer.stem(word) for word in words]  # Using stemming
    # words = [lemmatizer.lemmatize(word) for word in words]  # Using lemmatization
    
    # Step 6: Optional NER
    doc = nlp(" ".join(words))
    entities = [ent.text for ent in doc.ents]
    
    # Rebuild the processed text
    processed_text = ' '.join(words)
    
    return processed_text, entities

In [11]:
# Load the CSV file (replace 'your_file.csv' with your actual file path)
file_path = 'train.csv'
df = pd.read_csv(file_path)
# Fill NaN values in the target column with an empty string
df['crimeaditionalinfo'] = df['crimeaditionalinfo'].fillna('')

# Check which column contains the text data (assuming 'crime_info' is the column name)
# Modify 'crime_info' to the actual column name in your CSV
column_name = 'crimeaditionalinfo'

# Apply the preprocessing function
df['processed_text'], df['extracted_entities'] = zip(*df[column_name].apply(preprocess_text))

# Save the processed data to a new CSV file if needed
output_file_path = 'processed_data.csv'
df.to_csv(output_file_path, index=False)

# Display the processed data
print("Processed Data:")
print(df[['processed_text', 'extracted_entities']])

Processed Data:
                                          processed_text  \
0      continu random call abus messag whatsapp someo...   
1      continu messag ask pay money send fake crop nu...   
2      like demand money ad section text messag reque...   
3      apna appli interview telecal resourc manag wro...   
4      call ladi send new phone vivo parcel post th f...   
...                                                  ...   
93681  ident theft smish sm fraud creditdebit card fr...   
93682  call number ask phone pay cash back offer requ...   
93683  cyber stalk blackmail phonesmsvoip call victim...   
93684  call kark bola ki aapka lotari laga ha aru ac ...   
93685  app name koko loan app send money account unkn...   

                                      extracted_entities  
0                                           [abus, issu]  
1                                                     []  
2                                                     []  
3      [appli interview tel

In [12]:
df.head()

Unnamed: 0,category,sub_category,crimeaditionalinfo,processed_text,extracted_entities
0,Online and Social Media Related Crime,Cyber Bullying Stalking Sexting,I had continue received random calls and abusi...,continu random call abus messag whatsapp someo...,"[abus, issu]"
1,Online Financial Fraud,Fraud CallVishing,The above fraudster is continuously messaging ...,continu messag ask pay money send fake crop nu...,[]
2,Online Gambling Betting,Online Gambling Betting,He is acting like a police and demanding for m...,like demand money ad section text messag reque...,[]
3,Online and Social Media Related Crime,Online Job Fraud,In apna Job I have applied for job interview f...,apna appli interview telecal resourc manag wro...,"[appli interview telecal, twelv hundr charg, n..."
4,Online Financial Fraud,Fraud CallVishing,I received a call from lady stating that she w...,call ladi send new phone vivo parcel post th f...,[februari]
