In [17]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [11]:
def build_TF_IDF_X(data, svd_components=150, X_addition=None, random_state=42):
    """Build an SVD-reduced TF-IDF feature matrix from ``data['Description']``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``'Description'`` column. Missing descriptions are
        treated as empty documents.
    svd_components : int, default 150
        Number of components kept by ``TruncatedSVD``.
    X_addition : pandas.DataFrame or None, default None
        Optional extra features appended column-wise. Its index is reset, so
        rows are paired with the text features by position.
    random_state : int or None, default 42
        Seed forwarded to ``TruncatedSVD`` so repeated runs yield the same
        components. Pass ``None`` to restore unseeded behaviour.

    Returns
    -------
    list
        ``[X, top_keywords]`` where ``X`` holds the ``SVD<i>`` columns
        (followed by ``X_addition`` columns, if any) and ``top_keywords`` has
        the columns ``'Word'`` and ``'TF-IDF Score'`` (mean score over all
        documents), sorted by score in descending order.

    Raises
    ------
    ValueError
        If ``X_addition`` does not have the same number of rows as ``data``.
    """
    if X_addition is not None and len(X_addition) != len(data):
        # pd.concat(axis=1) would otherwise pad the shorter frame with NaN rows.
        raise ValueError(
            f"X_addition has {len(X_addition)} rows but data has {len(data)} rows"
        )

    # TfidfVectorizer rejects NaN documents, so map missing text to ''.
    # Lowercasing is left to the vectorizer (lowercase=True).
    documents = data['Description'].fillna('').astype(str)

    vectorizer = TfidfVectorizer(
        lowercase=True, stop_words='english', max_df=0.5, min_df=5, norm=None
    )
    tfidf_matrix = vectorizer.fit_transform(documents)
    print(f"TfidfVectorizer size: {tfidf_matrix.shape}")

    # Average TF-IDF score of every vocabulary term across all documents.
    # np.asarray(...).ravel() works whether .mean() returns np.matrix or ndarray.
    avg_tfidf_scores = np.asarray(tfidf_matrix.mean(axis=0)).ravel()
    keywords_df = pd.DataFrame({
        'Word': vectorizer.get_feature_names_out(),
        'TF-IDF Score': avg_tfidf_scores,
    })
    top_keywords = keywords_df.sort_values(by='TF-IDF Score', ascending=False)

    # Reduce the sparse TF-IDF matrix to a dense, low-dimensional representation.
    svd = TruncatedSVD(n_components=svd_components, random_state=random_state)
    X_text_svd = svd.fit_transform(tfidf_matrix)
    X_text_reduced = pd.DataFrame(
        X_text_svd, columns=[f'SVD{i}' for i in range(X_text_svd.shape[1])]
    )

    # Combine reduced text features with the optional extra features.
    if X_addition is None:
        X = X_text_reduced
    else:
        X = pd.concat([X_text_reduced, X_addition.reset_index(drop=True)], axis=1)

    return [X, top_keywords]

In [12]:
# Load the cleaned dataset; build_TF_IDF_X reads its 'Description' column.
data = pd.read_csv('./cleaned_data.csv')

In [13]:
# Fit TF-IDF + TruncatedSVD with the default 150 components.
# X: reduced text features; top_keywords: words sorted by mean TF-IDF score.
X, top_keywords = build_TF_IDF_X(data)

TfidfVectorizer size: (33129, 1360)


In [18]:
# Persist the keyword ranking without the DataFrame index column.
top_keywords.to_csv('./top_keywords.csv', index=False)

In [19]:
# Inspect the keyword table (sorted by mean TF-IDF score, highest first).
top_keywords

Unnamed: 0,Word,TF-IDF Score
700,exit,1.193164
359,75,1.048150
372,85,0.977337
1131,rd,0.890573
206,285,0.879330
...,...,...
1211,slappey,0.001451
1213,slygo,0.001451
861,ishman,0.001451
526,bungalow,0.001451


In [15]:
# Inspect the reduced feature matrix (one SVD<i> column per component).
X

Unnamed: 0,SVD0,SVD1,SVD2,SVD3,SVD4,SVD5,SVD6,SVD7,SVD8,SVD9,...,SVD140,SVD141,SVD142,SVD143,SVD144,SVD145,SVD146,SVD147,SVD148,SVD149
0,1.219935,1.023620,0.328884,2.949133,0.645708,-0.398811,-0.019072,-0.063675,0.574138,-0.301651,...,1.799562,-0.710454,-0.738691,-0.125417,-0.208435,-0.009300,1.024113,0.202659,0.622781,0.903744
1,1.598908,1.374566,0.217976,3.407339,0.135379,-0.574206,-0.094203,-0.233906,0.204430,-0.453246,...,1.129656,-0.364004,-0.319284,0.196731,-0.639400,-0.594372,0.509731,0.093116,0.399937,0.703174
2,4.044009,1.997650,0.149110,2.830380,-0.847173,2.567769,-1.077835,0.068358,-0.684847,1.401567,...,-0.080105,-0.177921,-0.068332,-0.094781,-0.482450,0.174508,-1.682735,0.224553,0.088991,-0.308342
3,3.091373,3.477369,-0.520177,3.900669,1.019787,-1.491720,-0.418964,-1.663939,1.650300,1.977708,...,-0.020521,1.558784,0.032732,0.098555,0.202283,-0.103297,0.682027,1.164533,-0.136471,0.688376
4,2.216008,1.646127,-0.055347,3.630583,-0.130941,2.004500,0.429704,0.882279,-0.532485,0.551580,...,-0.249605,0.171150,0.781727,-0.082951,-0.207301,0.298235,-0.654986,0.116892,0.224136,0.184152
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33124,1.807004,2.000471,2.750141,-0.659555,-0.144882,0.413816,3.680352,-0.657151,-2.240720,0.464924,...,-0.154570,-0.047672,0.015558,0.065125,0.159296,-0.082913,-0.036018,-0.273329,-0.136155,-0.031619
33125,3.371171,0.973999,0.015900,-0.187691,-1.316247,0.735017,-0.621977,-1.522268,1.321251,0.564768,...,-1.575711,1.573188,-0.761629,0.457275,0.048449,-0.083841,-1.491144,-0.034260,-0.483507,0.285429
33126,0.566046,0.526389,1.102546,2.314748,0.585019,-0.268334,-0.657268,0.105750,0.271732,-0.312271,...,-0.149145,0.088081,0.194952,0.292886,0.114773,0.129979,-0.092140,-0.107201,0.418899,-0.090181
33127,11.869317,-7.380556,-0.843153,-0.955940,-0.315022,-1.367946,-0.060853,1.605087,-1.657071,2.103131,...,-0.369866,-0.899364,-0.222465,1.098843,2.181466,-1.482185,-0.571611,0.531060,0.387176,-1.553776


In [16]:
# Persist the SVD text features without the DataFrame index column.
X.to_csv('./text_mining.csv', index=False)