In [42]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [43]:
!pip install pandas scikit-learn scikit-multilearn



In [44]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
from sklearn.utils import class_weight
from skmultilearn.problem_transform import BinaryRelevance

In [45]:
# Read the CSV file
df = pd.read_csv('/content/drive/MyDrive/ColabFiles/CategoryDataset1.csv')

# Tokenize the "Categories" column and convert to lowercase
df['categories'] = df['Categories'].apply(lambda x: [cat.strip().strip('"').lower() for cat in x.split(",")])

# Get unique categories and sort them alphabetically
all_categories = sorted(set(category for categories_list in df['categories'] for category in categories_list))

# Create a new dataframe with separate columns for each category
new_columns = {category: df['categories'].apply(lambda x: 1 if category in x else 0) for category in all_categories}
new_df = pd.concat([df, pd.DataFrame(new_columns)], axis=1)

# Drop the original 'categories' column if needed
new_df = new_df.drop(columns=['Categories', 'categories'])

# Shuffle the rows using pandas sample method
shuffled_df = new_df.sample(frac=1, random_state=42)  # frac=1 means the entire DataFrame will be sampled

# Save the result DataFrame to a new CSV file with UTF-8 encoding
shuffled_df.to_csv('/content/drive/MyDrive/ColabFiles/CategoryDataset2.csv', index=False, encoding='utf-8')

# Display the result
print(shuffled_df)

                                              Sentence  .net  address  \
229  "IT consultant providing strategic advice and ...     0        0   
73                             "susan.white@email.com"     0        0   
521                                     "Apache Kafka"     0        0   
86                                    "(555) 234-5678"     0        0   
469                                        "Bootstrap"     0        0   
..                                                 ...   ...      ...   
71                                      "Jacob Miller"     0        0   
106                                  "(94) 71-8765432"     0        0   
270  "C# software engineer specializing in developi...     1        0   
435  "Attended a course on Docker and Kubernetes fo...     0        0   
102                                  "(94) 76-9876543"     0        0   

     agile methodologies  ai  algorithms  android  angular  api  apple  ...  \
229                    0   0           0    

In [46]:
# Load your dataset (replace 'your_dataset.csv' with your actual file)
df = pd.read_csv('/content/drive/MyDrive/ColabFiles/CategoryDataset2.csv')

print(df.shape)

df.head()

(540, 151)


Unnamed: 0,Sentence,.net,address,agile methodologies,ai,algorithms,android,angular,api,apple,...,user interface,vector graphics editor,version control,virtual reality,volunteer work,vue,web development,web server,work experience,workshops
0,"""IT consultant providing strategic advice and ...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,"""susan.white@email.com""",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"""Apache Kafka""",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"""(555) 234-5678""",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"""Bootstrap""",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [47]:
# Exclude the first column (assuming 'Sentence' is the first column)
categories_less_than_two_rows = df.iloc[:, 1:].columns[df.iloc[:, 1:].sum(axis=0) < 3].tolist()

print("Categories with less than three rows having value 1:")
print(categories_less_than_two_rows)

Categories with less than three rows having value 1:
['azure', 'communication', 'finance', 'kubernetes', 'log management', 'messaging', 'mobile development kit', 'network protocol analyzer', 'networking', 'nosql', 'php', 'predictive modeling', 'preprocessor', 'react native', 'relational database', 'ruby on rails', 'scripting', 'social media', 'sql', 'text editor', 'vector graphics editor']


In [48]:
# Assuming 'Sentence' is your text feature column and the remaining columns are your category columns
X = df['Sentence']
y = df.iloc[:, 1:]  # Exclude the 'Sentence' column

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Use TF-IDF to convert text data into numerical vectors
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Calculate class weights
class_weights = class_weight.compute_class_weight('balanced', classes=[0, 1], y=y_train.values.flatten())

# Build a multi-label classification model using BinaryRelevance with SVM
classifier = BinaryRelevance(
    classifier=OneVsRestClassifier(SVC(kernel='linear', class_weight={0: class_weights[0], 1: class_weights[1]})),
    require_dense=[False, True]
)

# Train the model
classifier.fit(X_train_tfidf, y_train)

# Make predictions
y_pred = classifier.predict(X_test_tfidf)

# Evaluate the model
print(classification_report(y_test, y_pred, zero_division=1))



              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         7
           2       1.00      1.00      1.00         0
           3       0.00      1.00      0.00         0
           4       1.00      1.00      1.00         0
           5       1.00      1.00      1.00         1
           6       1.00      0.00      0.00         2
           7       1.00      1.00      1.00         1
           8       1.00      0.00      0.00         1
           9       1.00      1.00      1.00         1
          10       1.00      1.00      1.00         3
          11       1.00      0.00      0.00         1
          12       1.00      1.00      1.00         1
          13       1.00      1.00      1.00         0
          14       1.00      0.00      0.00         2
          15       1.00      0.50      0.67         2
          16       1.00      1.00      1.00         0
          17       1.00    

In [49]:
# Replace 'your_sentence' with the actual sentence you want predictions for
new_sentence = ["Mobile application developer who is holding a degree in Mobile Application Development with a proficiency in Kotlin and Swift."]

# Convert the new sentence to TF-IDF representation
new_sentence_tfidf = tfidf_vectorizer.transform(new_sentence)

# Initialize an empty DataFrame to store probability scores
new_sentence_prob_df = pd.DataFrame(columns=y.columns)

# Iterate through each label and get probability estimates
for label in range(y.shape[1]):
    # Train a new classifier for each label
    label_classifier = OneVsRestClassifier(SVC(kernel='linear', class_weight={0: class_weights[0], 1: class_weights[1]}))

    # Train the classifier on the entire dataset
    label_classifier.fit(X_train_tfidf, y_train.iloc[:, label])

    # Get probability estimates for the new sentence
    label_prob = label_classifier.decision_function(new_sentence_tfidf)

    # Add the probability estimates to the DataFrame
    new_sentence_prob_df[y.columns[label]] = label_prob

# Apply a sigmoid function to obtain probabilities between 0 and 1
new_sentence_prob_df = 1 / (1 + 10**(-new_sentence_prob_df))

# Get binary predictions for the new sentence
new_sentence_pred = (new_sentence_prob_df > 0.5).astype(int)

# Reshape the probability DataFrame for better readability
new_sentence_prob_table = new_sentence_prob_df.melt(var_name='Label', value_name='Prediction Score')

# Concatenate binary predictions to the DataFrame
new_sentence_prob_table['Prediction'] = new_sentence_pred.values.flatten()

# Filter for rows where the prediction is 1
new_sentence_prob_table = new_sentence_prob_table[new_sentence_prob_table['Prediction Score'] > 0.5]

# Display the table
print("Predictions for the new sentence:")
print(new_sentence_prob_table)



Predictions for the new sentence:
                    Label  Prediction Score  Prediction
43                 degree          0.760221           1
51              education          0.884699           1
75                 kotlin          0.671074           1
87                 mobile          0.934070           1
127  software development          0.968874           1
129                 swift          0.505115           1
148       work experience          0.900891           1
