In [None]:
# Mount Google Drive at /content/drive; later cells read their CSV files
# from paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
pip install pandas numpy google-generativeai



In [None]:
'''This Python script uses pandas to handle a dataset and 'google-generativeai' to fill in missing values using an AI model.
It configures the API key for the AI model and defines functions to generate content with the "gemini-pro" model and to clean the
dataset by filling in missing data. The script loads a CSV file, uses the AI model to suggest and fill missing values based on the
dataset's context, and saves the cleaned dataset as "cleaned_dataset.csv". The script also handles errors and retries if rate limits are exceeded.'''

In [None]:
import getpass
import os
import time

import google.generativeai as genai
import pandas as pd

# Configure the API key for the generative AI model.
# The key is taken from the GOOGLE_API_KEY environment variable, falling back
# to an interactive prompt, so the secret never lives in the notebook source.
# SECURITY: a key was previously hard-coded on this line; treat it as leaked
# and revoke/rotate it.
genai.configure(
    api_key=os.environ.get("GOOGLE_API_KEY") or getpass.getpass("Google Generative AI API key: ")
)

# Function to generate content using the generative AI model
def generate_gemini_content(prompt, max_retries=5, retry_delay=60):
    """Generate text for ``prompt`` with the "gemini-pro" model, retrying on failure.

    Args:
        prompt: Text sent to the model.
        max_retries: Maximum number of attempts before giving up (must be >= 1).
        retry_delay: Seconds to sleep between failed attempts (must be >= 0).

    Returns:
        The ``text`` attribute of the model response.

    Raises:
        ValueError: If ``max_retries`` is below 1 or ``retry_delay`` is negative.
        RuntimeError: If every attempt failed; the last error is chained as
            the cause. Bounding the retries keeps non-retryable failures from
            looping forever.
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")
    if retry_delay < 0:
        raise ValueError("retry_delay must be non-negative")

    # Build the model handle once rather than on every attempt.
    model = genai.GenerativeModel("gemini-pro")

    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            response = model.generate_content(prompt)
            return response.text
        except Exception as e:
            last_error = e
            print(f"An error occurred: {e}")
            # No point sleeping after the final attempt.
            if attempt < max_retries:
                print("Rate limit exceeded or another issue occurred, retrying after delay...")
                time.sleep(retry_delay)

    raise RuntimeError(
        f"Gemini request failed after {max_retries} attempts"
    ) from last_error

# Function to clean and fill missing values in a dataset
def clean_and_fill_missing_values(df, generate=None, max_context_rows=None):
    """Fill every missing cell of ``df`` with a value suggested by a text generator.

    For each missing cell a prompt is built from the complete rows of the
    dataset plus the row that contains the gap, and the stripped generator
    output is written into the cell. ``df`` is modified in place.

    Args:
        df: DataFrame whose missing values should be filled.
        generate: Callable taking the prompt string and returning the
            suggested text. Defaults to ``generate_gemini_content``.
        max_context_rows: Optional cap on the number of complete rows included
            in the prompt. ``None`` (default) includes all of them.

    Returns:
        The same ``df`` object, with missing cells filled.
    """
    if generate is None:
        generate = generate_gemini_content

    # The context is loop-invariant: build it once from the rows that were
    # complete in the input data. This avoids re-serialising the whole frame
    # for every missing cell and keeps generated values out of later prompts.
    complete_rows = df.dropna()
    if max_context_rows is not None:
        complete_rows = complete_rows.head(max_context_rows)
    context_string = '\n'.join(
        str(record) for record in complete_rows.to_dict(orient='records')
    )

    for column in df.columns:
        # Index labels of the rows where this column is missing.
        missing_indices = df.index[df[column].isnull()]

        for index in missing_indices:
            prompt = (
                f"Fill in the missing value for column '{column}' based on the following context:\n"
                f"Context:\n{context_string}\n"
                f"Missing value location: {df.loc[index].to_dict()}\n"
                f"Suggested value for column '{column}':"
            )

            # Ask the generator for a value and write it into the gap.
            df.at[index, column] = generate(prompt).strip()

    return df

# Example usage: load the raw CSV from Drive, fill its gaps with the model,
# and write the result next to the notebook.
if __name__ == "__main__":
    # Load your dataset
    file_path = "/content/drive/MyDrive/Hackathon/job_train.csv"  # Replace with your file path
    df = pd.read_csv(file_path)

    # Clean and fill missing values (df is modified in place and returned)
    cleaned_df = clean_and_fill_missing_values(df)

    # Save the cleaned dataset
    # NOTE(review): this is a relative path, while the next code cell reads
    # '/content/drive/MyDrive/Dataset/cleaned_dataset.csv' -- confirm how the
    # file gets from here to that Drive folder.
    cleaned_df.to_csv("cleaned_dataset.csv", index=False)
    print("Missing values have been filled and dataset saved as 'cleaned_dataset.csv'.")


In [None]:
'''This code snippet reads a CSV file named 'cleaned_dataset.csv' from a specified Google Drive path into a Pandas DataFrame ('df').
It then calculates the number of rows in the dataset using the 'len()' function and stores this count in the variable 'num_rows'.
Finally, it prints out the total number of rows in the dataset. This is useful for quickly assessing the size of the dataset you are working with.'''

In [None]:
import pandas as pd
# Read the cleaned dataset from Drive and report how many rows it contains.
df = pd.read_csv('/content/drive/MyDrive/Dataset/cleaned_dataset.csv')
num_rows = df.shape[0]
print("The number of rows in the dataset: {}".format(num_rows))

The number of rows in the dataset: 8940


In [None]:
'''This Python script is designed to clean a dataset by removing rows that contain specific unwanted values, such as 'UNKNOWN' or 'NAME',
in any column. The process begins by reading the dataset from a CSV file ('cleaned_dataset.csv') stored in Google Drive into a Pandas
DataFrame ('df'). It then filters out rows where any column contains the values 'UNKNOWN' or 'NAME', resulting in a cleaned DataFrame ('df_cleaned').
The number of rows after this cleaning operation is calculated and printed, giving an indication of how much data was removed. Finally, the
cleaned DataFrame is saved as a new CSV file named 'cleaned_dataset1.csv', and a confirmation message is printed to indicate that the cleaning
process is complete and the cleaned dataset has been saved.'''

In [None]:
import pandas as pd

# Load the dataset from Google Drive
df = pd.read_csv('/content/drive/MyDrive/Dataset/cleaned_dataset.csv')

# Removing rows with 'UNKNOWN' or 'NAME' in any column
df_cleaned = df[~df.isin(['UNKNOWN', 'NAME']).any(axis=1)]

# Print the cleaned DataFrame
print(df_cleaned)
num_rows_after = len(df_cleaned)
print(f"The number of rows after cleaning: {num_rows_after}")

# Save the cleaned DataFrame to a CSV file. The file name matches the
# confirmation message below; writing to 'cleaned_dataset.csv' instead would
# contradict that message.
df_cleaned.to_csv('cleaned_dataset1.csv', index=False)

print("Cleaned dataset saved as 'cleaned_dataset1.csv'")


                                                  title             location  \
0                        Architect Middleware MQ Kuwait                KW KU   
2     Process Controls Staff Engineer Foxboro I A Tr...  US TX USA Southwest   
3     Experienced Telemarketer Wanted Digital Solutions               AU NSW   
4                               Senior Network Engineer        GB ENG London   
5     Energy Financial Reporter Low Carbon Energy In...       US NY New York   
...                                                 ...                  ...   
8934                          Data Manager Fixed Income       US NY New York   
8935                                  Financial Analyst         GR I Paiania   
8936               Customer Service Associate Part Time   CA ON Peterborough   
8937                                      Sales Manager       UA 61 Ternopil   
8939                               Sale Representatives                US NY   

                                       

In [None]:
'''This Python script is used to clean a dataset by removing specific patterns from text entries. It begins by loading a CSV file
('cleaned_dataset (2).csv') from a local directory into a Pandas DataFrame. A function is defined to identify and remove patterns
matching 'URL', 'EMAIL', or 'PHONE' followed by an underscore and a 64-character hexadecimal string (e.g. 'URL_' plus 64 hex digits), which is typical of encoded or hashed data.
This function utilizes regular expressions for pattern matching. The script applies this cleaning function to every element of the
DataFrame, resulting in a cleaned version of the dataset. Finally, it saves the processed DataFrame to a new CSV file
('cleaned_dataset_no_specific_patterns.csv') in the same local directory. This ensures that sensitive or specific patterns are
removed from the dataset, making it more suitable for further analysis or sharing.'''

In [None]:
import pandas as pd
import re

# Load the CSV file
# NOTE(review): this is an absolute path on a local Windows machine, whereas
# the other cells read from the Colab Drive mount under /content/drive --
# confirm where this cell is meant to run.
file_path = 'C:/Users/prati/Downloads/cleaned_dataset (2).csv'
df = pd.read_csv(file_path)

# Placeholder tokens of the form URL_<hash>, EMAIL_<hash> or PHONE_<hash>,
# where <hash> is exactly 64 hexadecimal digits. Compiled once at definition.
_HASHED_TOKEN_PATTERN = re.compile(r'(URL|EMAIL|PHONE)_[0-9a-fA-F]{64}')


# Function to remove specific patterns (URL, EMAIL, PHONE) from a given text
def remove_specific_patterns(text):
    """Strip hashed URL/EMAIL/PHONE placeholder tokens from ``text``.

    Args:
        text: A cell value. Only ``str`` values are processed.

    Returns:
        ``text`` with every placeholder token removed when it is a string.
        Any other value (numbers, NaN, None) is returned unchanged, so
        missing values are not turned into the literal string 'nan' and
        numeric cells keep their type.
    """
    if not isinstance(text, str):
        return text
    return _HASHED_TOKEN_PATTERN.sub('', text)

# Apply the function to every cell, one column at a time. Series.map is used
# instead of DataFrame.applymap, which newer pandas releases deprecate.
df_cleaned = df.apply(lambda column: column.map(remove_specific_patterns))

# Save the cleaned DataFrame to a new CSV file
output_file_path = 'C:/Users/prati/Downloads/cleaned_dataset_no_specific_patterns.csv'
df_cleaned.to_csv(output_file_path, index=False)

# Echo the output location as the cell result
output_file_path

In [None]:
'''This Python script is used to preprocess a dataset for text analysis by combining information from multiple columns into a unified text
representation. It begins by loading a CSV file ('final.csv') into a Pandas DataFrame. The script then merges text from several specified
columns ('title', 'location', 'description', and 'requirements') into a new column named 'combined_text'. This merging involves filling any
missing values with empty strings and concatenating the text from each row into a single string. Following this, the script extracts the
combined text and the corresponding labels into separate lists. The labels are sourced from a column named 'fraudulent', which is intended
to indicate whether each entry is fraudulent. This preprocessing step prepares the dataset for advanced text analysis techniques, such as
feature extraction with TF-IDF or deep learning models like BERT.'''

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import BertTokenizer, BertModel
import torch
import numpy as np

# Read final.csv from the mounted Drive
df = pd.read_csv('/content/drive/MyDrive/Dataset/final.csv')

# Build one free-text field per row: blank out missing cells, cast every
# value to str, then join the selected columns with single spaces
text_columns = ['title', 'location', 'description', 'requirements']  # Adjust as needed
df['combined_text'] = (
    df[text_columns]
    .fillna('')
    .astype(str)
    .apply(' '.join, axis=1)
)

# Plain Python lists: the model inputs and their labels
text_data = df['combined_text'].tolist()
labels = df['fraudulent'].tolist()  # Update to use the correct label column

In [None]:
'''This code snippet utilizes the 'TfidfVectorizer' from the 'sklearn.feature_extraction.text' module to transform the combined text
data into a numerical format suitable for machine learning models. It initializes the 'TfidfVectorizer' with a limit of 2000 features,
which means the vocabulary is restricted to the 2000 terms with the highest term frequency across the corpus; term frequency-inverse document frequency (TF-IDF) scores are then computed for those terms only.
The 'fit_transform' method is then applied to the 'text_data', which contains the combined text from the dataset. This method computes
the TF-IDF scores for each term in the text data and creates a sparse matrix ('tfidf_matrix') where each row represents a document
(or text entry) and each column represents a term. The matrix contains TF-IDF values that quantify the importance of each term in
each document, preparing the text data for further analysis or machine learning tasks.'''

In [None]:
# Fit a TF-IDF vectorizer (vocabulary capped at 2000 terms) on the combined
# text and transform that same text into a sparse document-term matrix.
# NOTE(review): the vectorizer is fitted on all of text_data, i.e. before the
# train/test split performed later in the notebook, so test documents
# influence the vocabulary and IDF weights -- consider fitting on the
# training portion only.
tfidf_vectorizer = TfidfVectorizer(max_features=2000)
tfidf_matrix = tfidf_vectorizer.fit_transform(text_data)


In [None]:
'''This code snippet combines features extracted from TF-IDF and BERT to create a comprehensive feature set for text data. First, the
TF-IDF matrix, which was previously computed, is converted from a sparse matrix to a dense array using 'toarray()'. This dense
representation, 'tfidf_dense', captures the term frequency-inverse document frequency values for each document. Simultaneously,
BERT embeddings, which provide contextualized representations of the text, are assumed to be available in 'bert_embeddings'.
The script then horizontally stacks ('np.hstack()') these two feature sets—TF-IDF features and BERT embeddings—into a single array
called 'combined_features'. This array integrates the information from both feature extraction methods, enhancing the dataset with
a richer and more informative set of features that can be used for further analysis or machine learning models.'''

In [None]:
# Combine TF-IDF and BERT features
tfidf_dense = tfidf_matrix.toarray()
combined_features = np.hstack((tfidf_dense, bert_embeddings))

In [None]:
'''This code snippet utilizes BERT (Bidirectional Encoder Representations from Transformers) to generate contextual embeddings for a set
of text data. It begins by loading the pre-trained BERT tokenizer and model ('bert-base-uncased'). The script processes the text data in
batches of 16 to manage memory usage and efficiency. For each batch, it tokenizes the text into a format suitable for BERT using the tokenizer,
ensuring that text sequences are truncated and padded to a maximum length of 256 tokens. The tokenized inputs are then passed through the BERT
model to obtain embeddings. Specifically, the embeddings are extracted from the model’s last hidden state, focusing on the '[CLS]' token (the
first token of each sequence) to represent the entire sequence. These embeddings are converted from PyTorch tensors to NumPy arrays and appended
to a list. After processing all batches, the list of embeddings is concatenated into a single NumPy array ('bert_embeddings'), creating a dense
representation of the text data that captures rich contextual information.'''

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

batch_size = 16
bert_embeddings = []
for i in range(0, len(text_data), batch_size):
    batch_text = text_data[i:i+batch_size]
    inputs = tokenizer(batch_text, return_tensors='pt', truncation=True, padding=True, max_length=256)
    with torch.no_grad():
        outputs = model(**inputs)
    batch_embeddings = outputs.last_hidden_state[:, 0, :].numpy()
    bert_embeddings.append(batch_embeddings)

bert_embeddings = np.vstack(bert_embeddings)

In [None]:
'''This code snippet trains and evaluates a 'RandomForestClassifier' for text classification. It begins by splitting the dataset into
training and testing sets, reserving 20% for testing. The classifier is then trained on the training data and used to predict the test
data. The model’s performance is assessed by calculating accuracy and generating a detailed classification report, which includes precision,
recall, and F1-score metrics for each class. The results are printed to provide an overview of how effectively the model classifies the text data.'''

In [None]:
# Train and evaluate the model
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Hold out 20% for testing. The split is stratified on the labels so both
# parts keep the same class ratio (the positive class is a small minority).
X_train, X_test, y_train, y_test = train_test_split(
    combined_features, labels, test_size=0.2, random_state=42, stratify=labels
)

# Seed the forest as well, so repeated runs give the same model and metrics
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train, y_train)

predictions = classifier.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

Accuracy: 0.9647137150466045
Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1419
           1       1.00      0.36      0.53        83

    accuracy                           0.96      1502
   macro avg       0.98      0.68      0.76      1502
weighted avg       0.97      0.96      0.96      1502

