## Step 1: Setup Python Packages

In [126]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import normalize

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Step 2: Load the Dataset

In [127]:
df = pd.read_csv('Demo Profiles.csv')

In [128]:
df.head()

Unnamed: 0,first_name,last_name,company,position,industry,location
0,John,Doe,ABC Corp,Marketing Manager,Technology,San Francisco
1,Jane,Smith,XYZ Inc,Social Media Specialist,Advertising & Marketing,New York
2,Michael,Johnson,123 Company,Digital Marketing Analyst,Consulting,Chicago
3,Sarah,Williams,ABC Corp,Content Writer,Media & Publishing,London
4,David,Brown,XYZ Inc,Brand Manager,Consumer Goods,Miami


## Step 3: Text Preprocessing Techniques

In [129]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove punctuation and numbers
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)

    # Tokenization
    tokens = nltk.word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Stemming
    stemmer = PorterStemmer()
    stemmed = [stemmer.stem(word) for word in tokens]
    return ' '.join(stemmed)

## Step 4: Apply Preprocessing Techniques

In [130]:
# Apply preprocessing to the position column
df['processed_position'] = df['position'].apply(preprocess_text)

In [131]:
df.head()

Unnamed: 0,first_name,last_name,company,position,industry,location,processed_position
0,John,Doe,ABC Corp,Marketing Manager,Technology,San Francisco,market manag
1,Jane,Smith,XYZ Inc,Social Media Specialist,Advertising & Marketing,New York,social media specialist
2,Michael,Johnson,123 Company,Digital Marketing Analyst,Consulting,Chicago,digit market analyst
3,Sarah,Williams,ABC Corp,Content Writer,Media & Publishing,London,content writer
4,David,Brown,XYZ Inc,Brand Manager,Consumer Goods,Miami,brand manag


## Step 5: Process of Text Vectorization

**Option A: Bag-of-Words**

In [132]:
# Vectorization using Bag-of-Words model
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(df['processed_position'])

In [133]:
# Convert to Array
bow_array = bow_matrix.toarray()
print(bow_array)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 1 0 0]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 1 0 0]]


**Option B: TfidfVectorizer**

In [134]:
# Vectorization using TfidfVectorizer model
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(df['processed_position'])

In [135]:
# Convert to Array
X_array = X.toarray()
print(X_array)

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.60225663 0.         0.        ]
 [0.         0.         0.62026425 ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.86288949 0.        ]
 [0.         0.         0.         ... 0.60225663 0.         0.        ]]


## Step 6: Normalize the Vectorized Data

In [136]:
normalized_matrix = normalize(X, norm='l2', axis=1)

In [137]:
normalized_array = normalized_matrix.toarray()
print(normalized_array)

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.60225663 0.         0.        ]
 [0.         0.         0.62026425 ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.86288949 0.        ]
 [0.         0.         0.         ... 0.60225663 0.         0.        ]]


## Step 7: Encode the Target Variable

In [138]:
unique_values = df['industry'].unique()

In [139]:
for i, value in enumerate(unique_values, 0):
    print(f"{i}. {value}")

0. Technology
1. Advertising & Marketing
2. Consulting
3. Media & Publishing
4. Consumer Goods
5. E-commerce
6. Fashion & Apparel
7. Beauty & Cosmetics
8. Market Research
9.  Marketing Coordinator


In [140]:
# Encode the target variable
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['industry'])

In [141]:
print(y)

[9 1 3 8 4 5 6 9 1 3 8 4 2 7 9 1 8 5 6 9 3 8 4 1 5 6 9 1 3 8 4 9 5 6 8 4 2
 7 9 1 8 5 6 9 3 8 4 1 5 6 9 1 3 8 4 9 5 6 8 4 2 7 9 1 8 5 6 9 3 8 4 1 5 6
 9 1 3 8 4 9 5 6 8 4 2 7 9 1 0 5 6 9 3 8 4 1 5 6 9 1]
