<a href="https://colab.research.google.com/github/Rafiy27/Tugas-Akhir-Hate-Speech/blob/main/Corpus_Building_Feature_Extraction_(FastText)_Feature_Expansion_(TF_IDF)_Tweet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Libraries

In [None]:
!pip install pandas openpyxl



In [None]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/68.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.11.1-py3-none-any.whl (227 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp310-cp310-linux_x86_64.whl size=4199772 sha256=b69f859b327ba82cf30b486f3ca9dbf190b285fd388e708387ba8e2bd4ecdcdc
  Stored in directory: /root/.cache/pip/wheels/a5/13/75/f811c84a8ab36eedbaef977a6a58a98990e8e0f1967f98f394
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.11.1


# Load & Preprocess Tweet Data

## Import Library & Load Dataset

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

# Read the tweet spreadsheet into a DataFrame; openpyxl is the engine used to parse the .xlsx file
DATASET_PATH = 'modified_dataset - Copy - Copy.xlsx'
df = pd.read_excel(DATASET_PATH, engine='openpyxl')

In [None]:
# Build a boolean mask that is True wherever the 'full_text' cell is not a Python string
# (missing cells such as NaN fall into this category)
is_not_text = df['full_text'].apply(lambda value: not isinstance(value, str))

# Keep only the offending rows and show them for inspection
error_rows = df[is_not_text]
print(error_rows)

         No.               id_str  conversation_id_str full_text  Label_Putri  \
8927    8928  1716550000000000000  1716050000000000000       NaN          NaN   
16514  16515             1,65E+18             1,65E+18       NaN          NaN   
16515  16516             1,65E+18             1,65E+18       NaN          NaN   
16516  16517             1,65E+18             1,65E+18       NaN          NaN   
16517  16518             1,65E+18             1,65E+18       NaN          NaN   
16518  16519             1,65E+18             1,65E+18       NaN          NaN   
16519  16520             1,65E+18             1,65E+18       NaN          NaN   

       Label_Dea  Label_Rafi  Label_Final  
8927         NaN         0.0            0  
16514        NaN         NaN            0  
16515        NaN         NaN            1  
16516        NaN         NaN            0  
16517        NaN         NaN            0  
16518        NaN         NaN            1  
16519        NaN         NaN            1  

In [None]:
# Drop rows whose 'full_text' is missing and renumber the index from 0.
# Without the reset, the index keeps gaps where rows were removed, and any later
# index-aligned operation against a freshly built frame (for example
# pd.concat(..., axis=1)) pairs rows with the wrong data and introduces NaN rows.
# Rebinding instead of inplace=True also keeps the cell safe to re-run.
df = df.dropna(subset=['full_text']).reset_index(drop=True)

## Preprocess Tweet Data

In [None]:
# Data Cleaning
def clean_text(text):
    """Strip URLs, punctuation/symbols and non-ASCII characters from a string.

    The substitutions run in a fixed order: first every token that starts with
    ``http`` (up to the next whitespace), then every character that is neither
    a word character nor whitespace, and finally every run of non-ASCII
    characters. Whitespace itself is never touched, so removed pieces can leave
    consecutive spaces behind.
    """
    removal_patterns = (
        r'http\S+',        # URLs
        r'[^\w\s]',        # anything that is not a word character or whitespace
        r'[^\x00-\x7F]+',  # non-ASCII leftovers (emojis, accented letters, ...)
    )
    cleaned = text
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned

# Data cleaning followed by case folding: clean every tweet, then lower-case it
df['full_text'] = df['full_text'].apply(clean_text).str.lower()

# Tokenizing ('punkt' is downloaded first, before word_tokenize is applied)
nltk.download('punkt')
df['tokens'] = df['full_text'].apply(word_tokenize)

# Stop-word filtering with NLTK's 'indonesian' stop-word list
nltk.download('stopwords')
stop_words = set(stopwords.words('indonesian'))


def _drop_stop_words(tokens):
    """Return the tokens that are not contained in ``stop_words``."""
    return [token for token in tokens if token not in stop_words]


df['filtered_tokens'] = df['tokens'].apply(_drop_stop_words)

# Stemming
# NOTE(review): PorterStemmer is NLTK's English Porter stemmer, while the stop-word
# list above is Indonesian (e.g. 'anies' becomes 'ani' in the output below) —
# confirm that an English stemmer is really intended for this data.
stemmer = PorterStemmer()


def _stem_and_join(tokens):
    """Stem each token and glue the results back into one space-separated string."""
    return ' '.join(stemmer.stem(token) for token in tokens)


df['stemmed_text'] = df['filtered_tokens'].apply(_stem_and_join)

# Target labels, taken from the 'Label_Final' column
labels = df['Label_Final']

# Preview of the raw text next to its processed form and label
print(df[['full_text', 'stemmed_text', 'Label_Final']].head())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


                                           full_text  \
0               android_ak_47 anies orang campus aja   
1  pak anies juga blm masa kampanye tapi bawaslu ...   
2  kosim__ yang ngusung anies sbg gub dki itu pks...   
3  android_ak_47 saya malu klo tdk milih anies me...   
4  msaid_didu entah siapa yg salah ttg kegagalan ...   

                                        stemmed_text  Label_Final  
0                  android_ak_47 ani orang campu aja            0  
1  ani blm kampany bawaslu sdh edar sm berantai m...            0  
2  kosim__ ngusung ani sbg gub dki pk gerindrapra...            0  
3  android_ak_47 malu klo tdk milih ani kerusakan...            0  
4  msaid_didu yg salah ttg kegagalan food estat y...            0  


# Save Corpus

In [None]:
# Build the training corpus with ONE tweet per line.
# fastText reads its training file line by line and treats a newline as the end of
# a sentence. Joining every tweet with spaces would produce a single endless line,
# so context windows would run across unrelated tweets; newline-separated tweets
# keep each tweet as its own training sentence.
corpus = '\n'.join(df['stemmed_text'].astype(str))

# Save the corpus to a UTF-8 text file; the final newline terminates the last tweet
with open('corpus.txt', 'w', encoding='utf-8') as file:
    file.write(corpus)
    file.write('\n')

print("Corpus created and saved as 'corpus.txt'.")

Corpus created and saved as 'corpus.txt'.


# Train FastText Model

In [None]:
import fasttext

# Plain-text corpus that the model is trained on
corpus_path = 'corpus.txt'

# Destination of the binary model file
model_path = 'trained_model.bin'

# Fit unsupervised skip-gram embeddings on the corpus.
# Only the model type is set explicitly; every other hyper-parameter keeps its default.
model = fasttext.train_unsupervised(input=corpus_path, model='skipgram')

# Persist the trained model to disk
model.save_model(model_path)

print("Model trained and saved successfully!")

Model trained and saved successfully!


In [None]:
# Re-read the saved model from disk and rebind `model` to the loaded instance
model = fasttext.load_model(path=model_path)



## Feature Extraction (FastText)

In [None]:
# Function to extract feature vectors for a given text
def extract_features(text):
    """Look up a FastText vector for every whitespace-separated word in ``text``.

    Uses the module-level ``model``. Returns a list holding one vector per
    word, in order of appearance; text that contains no words yields an empty
    list.
    """
    return [model.get_word_vector(token) for token in text.split()]

# Embed every preprocessed tweet: first a list with one vector per word,
# then that list converted into a single NumPy array per row
df['feature_vectors'] = df['stemmed_text'].apply(extract_features).apply(np.array)

# Write the enriched frame to disk without the index column.
# NOTE(review): to_csv stores each array as its printed text, which NumPy may
# abbreviate with '...' for large arrays — confirm this file is not meant to be
# read back as numeric features.
df.to_csv('data_with_features.csv', index=False)

# Preview the text, label and embedding columns
feature_preview_columns = ['full_text', 'stemmed_text', 'Label_Final', 'feature_vectors']
print(df[feature_preview_columns].head())

                                           full_text  \
0               android_ak_47 anies orang campus aja   
1  pak anies juga blm masa kampanye tapi bawaslu ...   
2  kosim__ yang ngusung anies sbg gub dki itu pks...   
3  android_ak_47 saya malu klo tdk milih anies me...   
4  msaid_didu entah siapa yg salah ttg kegagalan ...   

                                        stemmed_text  Label_Final  \
0                  android_ak_47 ani orang campu aja            0   
1  ani blm kampany bawaslu sdh edar sm berantai m...            0   
2  kosim__ ngusung ani sbg gub dki pk gerindrapra...            0   
3  android_ak_47 malu klo tdk milih ani kerusakan...            0   
4  msaid_didu yg salah ttg kegagalan food estat y...            0   

                                     feature_vectors  
0  [[0.07310548, -0.019598916, 0.30345085, 0.1652...  
1  [[0.0022707172, 0.028685242, -0.18997926, -0.2...  
2  [[-0.41321686, -0.32017013, -0.16772135, -0.20...  
3  [[0.07310548, -0.01959891

# Feature Expansion (TF-IDF)

In [None]:
# Initialize the TF-IDF vectorizer with max_features set to 5000
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform the 'stemmed_text' to compute TF-IDF scores (one matrix row per tweet)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['stemmed_text'])
tfidf_terms = list(tfidf_vectorizer.get_feature_names_out())

# Convert the TF-IDF matrix to a DataFrame that REUSES df's index.
# pd.concat(axis=1) aligns rows by index label, not by position. df has lost rows
# (missing 'full_text'), so a default 0..n-1 index on the TF-IDF frame would attach
# scores to the wrong tweets and append all-NaN rows for the unmatched labels.
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_terms, index=df.index)

# Combine the original DataFrame with the TF-IDF DataFrame
df_combined = pd.concat([df, tfidf_df], axis=1)

# Guard against silent misalignment: the combined frame must have exactly one row per tweet
assert len(df_combined) == len(df), "TF-IDF rows are misaligned with the tweets"

# Save the combined DataFrame to a CSV file (optional)
df_combined.to_csv('data_with_features_and_tfidf.csv', index=False)

# Display the combined DataFrame (optional)
print(df_combined[['full_text', 'stemmed_text', 'Label_Final'] + tfidf_terms].head())

                                           full_text  \
0               android_ak_47 anies orang campus aja   
1  pak anies juga blm masa kampanye tapi bawaslu ...   
2  kosim__ yang ngusung anies sbg gub dki itu pks...   
3  android_ak_47 saya malu klo tdk milih anies me...   
4  msaid_didu entah siapa yg salah ttg kegagalan ...   

                                        stemmed_text  Label_Final   01   02  \
0                  android_ak_47 ani orang campu aja          0.0  0.0  0.0   
1  ani blm kampany bawaslu sdh edar sm berantai m...          0.0  0.0  0.0   
2  kosim__ ngusung ani sbg gub dki pk gerindrapra...          0.0  0.0  0.0   
3  android_ak_47 malu klo tdk milih ani kerusakan...          0.0  0.0  0.0   
4  msaid_didu yg salah ttg kegagalan food estat y...          0.0  0.0  0.0   

   03__nakula   10  100  1000   11  ...  zioni  zoelfick  zoey  zon  zonauang  \
0         0.0  0.0  0.0   0.0  0.0  ...    0.0       0.0   0.0  0.0       0.0   
1         0.0  0.0  0.0   

In [None]:
# Preview the first rows again: the text and label columns followed by every TF-IDF term column
tfidf_preview_columns = ['full_text', 'stemmed_text', 'Label_Final', *tfidf_vectorizer.get_feature_names_out()]
print(df_combined[tfidf_preview_columns].head())

                                           full_text  \
0               android_ak_47 anies orang campus aja   
1  pak anies juga blm masa kampanye tapi bawaslu ...   
2  kosim__ yang ngusung anies sbg gub dki itu pks...   
3  android_ak_47 saya malu klo tdk milih anies me...   
4  msaid_didu entah siapa yg salah ttg kegagalan ...   

                                        stemmed_text  Label_Final   01   02  \
0                  android_ak_47 ani orang campu aja          0.0  0.0  0.0   
1  ani blm kampany bawaslu sdh edar sm berantai m...          0.0  0.0  0.0   
2  kosim__ ngusung ani sbg gub dki pk gerindrapra...          0.0  0.0  0.0   
3  android_ak_47 malu klo tdk milih ani kerusakan...          0.0  0.0  0.0   
4  msaid_didu yg salah ttg kegagalan food estat y...          0.0  0.0  0.0   

   03__nakula   10  100  1000   11  ...  zioni  zoelfick  zoey  zon  zonauang  \
0         0.0  0.0  0.0   0.0  0.0  ...    0.0       0.0   0.0  0.0       0.0   
1         0.0  0.0  0.0   

In [None]:
# Summarise the loaded model: vector dimension and vocabulary size
vocabulary = model.words
print(f"Dimension of word vectors: {model.get_dimension()}")
print(f"Number of words in the model's vocabulary: {len(vocabulary)}")

# List the first ten vocabulary entries as a sample
print("\nSample vocabulary:")
for word in vocabulary[:10]:
    print(word)

Dimension of word vectors: 100
Number of words in the model's vocabulary: 10790

Sample vocabulary:
polisi
yg
rt
agama
ani
bangsat
kontol
lu
jokowi
orang


In [None]:
# The training text was lower-cased during preprocessing, so the query must be
# lower-cased too: a capitalised query contains character n-grams that never
# occurred in the corpus, which degrades the vector built for it.
query = 'Anies'.lower()
print(model.get_nearest_neighbors(query))

[(0.9315925240516663, 'aniessandi'), (0.931538462638855, 'aniesngibul'), (0.9249277710914612, 'aniespenipu'), (0.9236127138137817, 'aniesgaben'), (0.9234165549278259, 'anies_relawan'), (0.9165322184562683, 'aniesmania'), (0.9150654673576355, 'aniesbusuk'), (0.9149628281593323, 'aniesimin'), (0.9147285223007202, 'aniesgabecu'), (0.9102646708488464, 'aniesygmani')]


In [None]:
# Collect the model vocabulary and look up the vector of every entry
words = model.words
vectors = list(map(model.get_word_vector, words))

# Tabulate the vectors: one row per word (the word is the row label)
word_vectors_df = pd.DataFrame(np.array(vectors), index=words)

# Write the table to disk, keeping the word labels as the first CSV column
word_vectors_df.to_csv('word_vectors.csv')

# Preview the first rows
print(word_vectors_df.head())

              0         1         2         3         4         5         6   \
polisi -0.576942 -0.616024  0.342153 -0.397687 -0.030046 -0.608560 -0.352871   
yg      0.062726 -0.180544  0.290070  0.164792 -0.065855  0.049383 -0.099984   
rt      0.111445 -0.157175  0.368534  0.368823  0.162012  0.143876  0.496929   
agama   0.351173  0.219019  0.698730  0.411099  0.049002 -0.078647  0.833379   
ani     0.002271  0.028685 -0.189979 -0.276333 -0.221277  0.140011 -0.878774   

              7         8         9   ...        90        91        92  \
polisi -0.088243  0.589864 -0.656664  ...  0.559057  0.546212  0.026423   
yg     -0.152045  0.199241 -0.373167  ...  0.319712 -0.169090 -0.031436   
rt      0.025692  0.085010 -0.146897  ...  0.003236 -0.264650  0.134863   
agama   0.058797  0.138839 -0.136421  ...  0.091312  0.099400  0.194005   
ani     0.244312  0.541722  0.349854  ...  1.138707 -0.103604 -0.012713   

              93        94        95        96        97        98  