# Import Part

In [1]:
# Colab-only step: attach the user's Google Drive to the runtime file system
# under /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer

### Load Data

In [3]:
# Load the parallel en_text / fa_text table and keep only the first 100 rows.
# The explicit .copy() makes the subset an independent DataFrame: later cells
# add columns to `data`, and assigning into a bare slice of the loaded frame
# triggers pandas' SettingWithCopyWarning.
data = pd.read_csv('TCP_sub.csv').iloc[:100].copy()
data.head()

Unnamed: 0,en_text,fa_text
0,raspy breathing .,صداي خر خر .
1,dad .,پدر .
2,maybe its the wind .,شايد صداي باد باشه .
3,no .,نه .
4,stop please stop .,دست نگه داريد خواهش ميکنم دست نگه داريد .


### Preprocessing

In [4]:
def preprocess_text_v2(text):
    """Normalise one raw sentence into a cleaned, lowercase string.

    Steps, in order: lowercase the text, delete digit runs, replace every run
    of non-word characters with a single space, drop whole words that are one
    or two characters long, then collapse whitespace and trim both ends.

    Args:
        text: The raw sentence. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as missing text.

    Returns:
        str: The cleaned sentence, or ``''`` when the input is missing or
        nothing survives the filters.
    """
    # A missing value must not be stringified: str(None) / str(nan) would
    # produce the literal words "none" / "nan", which would then be counted
    # as real tokens downstream. `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    text = re.sub(r'\d+', '', text)            # strip digits
    text = re.sub(r'\W+', ' ', text)           # punctuation/symbols -> one space
    text = re.sub(r'\b\w{1,2}\b', '', text)    # drop 1-2 character words
    text = re.sub(r'\s+', ' ', text).strip()   # tidy leftover whitespace
    return text


# Clean every English sentence element-wise and store the result in a new
# column alongside the raw text.
data['processed_en_text'] = data['en_text'].map(preprocess_text_v2)

### Calculate TF-IDF

In [5]:
# Vocabulary filters: a term must occur in at least 2 sentences (min_df) and
# in no more than 95% of them (max_df); scikit-learn's built-in English
# stop-word list is removed as well.
tfidf_vectorizer_en = TfidfVectorizer(
    min_df=2,
    max_df=0.95,
    stop_words='english',
)

# Learn the vocabulary and produce the sentence-by-term TF-IDF matrix in one pass.
tfidf_matrix_en = tfidf_vectorizer_en.fit_transform(
    data['processed_en_text']
)

### Extracting non-zero values from the TF-IDF matrix

In [6]:
# Coordinates of every non-zero entry: rows[i] is the sentence (matrix row)
# and cols[i] the vocabulary column of the i-th non-zero weight.
nonzero_coords = tfidf_matrix_en.nonzero()
rows, cols = nonzero_coords

# Pairwise fancy indexing picks the weight stored at each (row, col) coordinate.
non_zero_values = tfidf_matrix_en[rows, cols]

### Convert non-zero values to an array

In [7]:
# Copy the extracted weights into a plain ndarray and collapse it to one
# dimension so it can be used directly as a DataFrame column.
non_zero_values = np.array(non_zero_values).reshape(-1)

### Create a DataFrame of non-zero TF-IDF values

In [8]:
# Fetch the vocabulary once. Calling get_feature_names_out() inside a
# per-entry comprehension rebuilds the whole name array for every non-zero
# value; a single array lookup indexed by `cols` gives the same names.
feature_names = tfidf_vectorizer_en.get_feature_names_out()

# Long-format table: one row per non-zero (sentence, term) weight.
df_tfidf = pd.DataFrame({
    'sentence_index': rows + 1,        # matrix row shifted to a 1-based sentence number
    'feature': feature_names[cols],    # term text for each vocabulary column index
    'tfidf_value': non_zero_values
})

### Save the DataFrame as a CSV file

In [9]:
# Write the long-format TF-IDF table to the working directory; the DataFrame
# index is omitted because sentence_index already identifies each row.
df_tfidf.to_csv(path_or_buf='non_zero_tfidf_values.csv', index=False)