# Packages

In [1]:
%%capture
# Environment setup. %%capture suppresses the (long) installer output of this cell.
# NumPy and the CUDA 12.1 PyTorch stack are pinned to exact versions.
!pip install numpy==1.26.4
!pip install torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 --index-url https://download.pytorch.org/whl/cu121
# NOTE(review): transformers is installed without a version pin, so the resolved version can change between runs.
!pip install transformers
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# --no-deps installs only the listed packages and skips their dependencies.
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate
!pip install -U bitsandbytes
!pip install pandas

In [2]:
import torch

# Report the CUDA toolkit PyTorch was built against, the PyTorch version, and
# the compute capability of the current GPU.
cuda_version = torch.version.cuda
pytorch_version = torch.__version__

# get_device_capability() raises when no CUDA device is usable, so only query
# it when one is available; otherwise leave the values as None.
if torch.cuda.is_available():
    major_version, minor_version = torch.cuda.get_device_capability()
    capability = f"{major_version}.{minor_version}"
else:
    major_version = minor_version = None
    capability = "unavailable (no CUDA device detected)"

print(f"CUDA version: {cuda_version}")
print(f"PyTorch version: {pytorch_version}")
print(f"CUDA Device compute capability: {capability}")

CUDA version: 12.1
PyTorch version: 2.3.0+cu121
CUDA Device compute capability: 8.9


In [3]:
import os

# Set TOKENIZERS_PARALLELISM to "false" for this process.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

# Load the Dataset

In [4]:
from datasets import load_dataset

# Load the train split of the QQP triplets dataset (cached under
# cache_dir_dataset) and convert it to a pandas DataFrame.
dataset_1 = load_dataset(
    "embedding-data/QQP_triplets",
    split="train",
    cache_dir='cache_dir_dataset',
).to_pandas()
dataset_1.head()

Downloading readme:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Unnamed: 0,set
0,{'query': 'Why in India do we not have one on ...
1,"{'query': 'What is OnePlus One?', 'pos': ['How..."
2,{'query': 'Does our mind control our emotions?...
3,{'query': 'What made Islam spread around the w...
4,{'query': 'What do people around the world typ...


In [39]:
# Bind `df` to the loaded frame; this is a second name for the same object, not a copy.
df = dataset_1

In [40]:
import pandas as pd


def _expand_column(frame, column):
    """Replace `column` by the columns obtained from expanding each of its values with pd.Series."""
    expanded = frame[column].apply(pd.Series)
    return pd.concat([frame.drop(column, axis=1), expanded], axis=1)


# 'set' expands into the 'query', 'pos' and 'neg' columns used below.
df = _expand_column(df, 'set')
# 'pos' expands into integer-labelled columns; column 0 becomes 'positive'.
df = _expand_column(df, 'pos')
df.rename(columns={0: 'positive'}, inplace=True)
# 'neg' expands into one integer-labelled column per negative example.
df = _expand_column(df, 'neg')
df[:2]

Unnamed: 0,query,positive,0,1,2,3,4,5,6,7,...,20,21,22,23,24,25,26,27,28,29
0,Why in India do we not have one on one politic...,Why cant we have a public debate between polit...,Can people on Quora stop India Pakistan debate...,"Why do politicians, instead of having a decent...",Can educated politicians make a difference in ...,What are some unusual aspects about politics a...,What is debate?,Why does civic public communication and discou...,What is a Parliamentary debate?,Why do we always have two candidates at the U....,...,Who is the best politician in India? Why?,We all know about the present condition of Ind...,Who are clean politicians in India?,Why are you not believing in Democracy of India?,What does politics in India mean? What are the...,What are the strongest arguments for a debate ...,Do we really need an election commission in In...,Why is there no concept of political correctne...,Why is population control not on agenda of any...,Who are some of the most dangerous or worst po...
1,What is OnePlus One?,How is oneplus one?,Why is OnePlus One so good?,What would OnePlus 4 be like?,What does the logo of oneplus phones mean?,Should I root my OnePlus One?,Do you like the OnePlus 2? Would you buy it?,Which one do you prefer: the OnePlus Two or Mo...,What is 1Google?,Which root is stable for OnePlus Two?,...,What type of dog is this one?,What is 1g 2g 3g and 4g?,What does 1.1i mean?,As a user what is your review of OnePlus 3 inc...,What are some examples of one-celled organisms?,"What is"" one India one election"" concept?",Why doesn't the OnePlus One have functions lik...,What is the difference between the OnePlus two...,Is it a better idea to buy oneplus3 or I shoul...,What is a double?


In [41]:
from numpy import nan

# Emit one (query, positive, negative) row for each of the first three
# negative columns (labels 0, 1, 2) of every input row, in row order.
triplet_source = df[["query", "positive", 0, 1, 2]]
data = [
    [query, positive, negative]
    for query, positive, *negatives in triplet_source.itertuples(index=False, name=None)
    for negative in negatives
]
df = pd.DataFrame(data)

In [42]:
# Preview the first rows of the triplet frame (columns are still labelled 0, 1, 2 here).
df.head()

Unnamed: 0,0,1,2
0,Why in India do we not have one on one politic...,Why cant we have a public debate between polit...,Can people on Quora stop India Pakistan debate...
1,Why in India do we not have one on one politic...,Why cant we have a public debate between polit...,"Why do politicians, instead of having a decent..."
2,Why in India do we not have one on one politic...,Why cant we have a public debate between polit...,Can educated politicians make a difference in ...
3,What is OnePlus One?,How is oneplus one?,Why is OnePlus One so good?
4,What is OnePlus One?,How is oneplus one?,What would OnePlus 4 be like?


In [43]:
# Number of triplet rows currently in df.
df.shape[0]

305286

In [44]:
# Drop every row that contains a missing value and report what was removed.
null_count_before = df.isna().sum().sum()
temp = len(df)  # row count before the drop
# Renumber the rows 0..n-1 afterwards; dropping leaves gaps in the index.
df = df.dropna(axis=0).reset_index(drop=True)
print("The number of Dropped rows:",temp-len(df))
null_count_dataset_1 = dataset_1.isna().sum().sum()
null_count_df = df.isna().sum().sum()
print("total number of null values in df_before and df:",null_count_before, "and", null_count_df)

The number of Dropped rows: 2888
total number of null values in df_before and df: 2888 and 0


In [45]:
# Give the three positional columns their triplet names.
df.rename(columns={0: 'anchor', 1: 'positive', 2: 'negative'}, inplace=True)

In [46]:
# Total number of characters across all anchor, positive and negative texts.
total_character = sum(
    len(anchor) + len(positive) + len(negative)
    for anchor, positive, negative in zip(df["anchor"], df["positive"], df["negative"])
)
print(total_character)

48484851


In [47]:
# Count the empty strings in the three text columns.
no_translate = sum(
    len(text) == 0
    for column in ("anchor", "positive", "negative")
    for text in df[column]
)
print(no_translate)

0


In [48]:
# Row counts: the triplet frame vs. the originally loaded dataset.
len(df), len(dataset_1)

(302398, 101762)

In [49]:
# Preview the first rows of df.
df.head()

Unnamed: 0,anchor,positive,negative
0,Why in India do we not have one on one politic...,Why cant we have a public debate between polit...,Can people on Quora stop India Pakistan debate...
1,Why in India do we not have one on one politic...,Why cant we have a public debate between polit...,"Why do politicians, instead of having a decent..."
2,Why in India do we not have one on one politic...,Why cant we have a public debate between polit...,Can educated politicians make a difference in ...
3,What is OnePlus One?,How is oneplus one?,Why is OnePlus One so good?
4,What is OnePlus One?,How is oneplus one?,What would OnePlus 4 be like?


In [50]:
# Sanity check: total number of missing cells in df.
null_count_df = df.isna().sum().sum()
null_count_df

0

In [51]:
# Start df_1 as an empty frame with the three triplet columns.
df_1 = pd.DataFrame(columns=['anchor', 'positive', 'negative'])

In [52]:
# Find the longest text (in characters) in the whole dataset.
# max_length_index collects the row label of every new running maximum, so its
# last entry is the row that holds the overall longest text.
max_length = 0
max_length_index = [0]
for index, row in df.iterrows():
    for column in ("anchor", "positive", "negative"):
        text_length = len(row[column])
        if text_length > max_length:
            max_length = text_length
            max_length_index.append(index)

print(max_length, max_length_index[-1])

1169 260480


In [54]:
# Show the 'positive' text of the row that holds the longest text.
# The column key was misspelled ("positiv"), which raises KeyError.
# max_length_index stores index labels, so look the row up by label.
df.loc[max_length_index[-1], "positive"]

'I have been married almost 15 years. We have two children, 11 and 8. My childhood was horrible, seven years of sexual, physical, and mental/emotional abuse followed by instability and low self esteem issues as a teen. My husband met me after I began dealing with my childhood and knew all about my issues. He knew what would end our marriage, yet he did it and lied about it multiple times. I was pregnant and did not leave. We tried to make it work. We had another child. We have been through MANY "situations" in our marriage and as parents. For the past 6-8 years we have just been living as parents. Not lovers. In the past I tried everything to make our marriage work. I finally decided to be done with it. Now he says he loves me and wants to make it work. But he is manipulative and can be so mean. He uses my children to cause me pain and stress. He knows that they are the only reason I stay. We are in counciling, but I really have no "love" feelings for him. I told him I want a divorce, 

In [56]:
# Keep only triplets whose anchor, positive AND negative are each at most
# `limit_length` characters long.
# The previous version compared positive/negative against `max_length` (the
# longest text in the dataset), so those two columns were never filtered.
limit_length = 500
max_rows = 100000  # only the first `max_rows` rows of df are considered
text_columns = ["anchor", "positive", "negative"]

candidates = df.iloc[:max_rows][text_columns]
within_limit = candidates.apply(lambda column: column.str.len() <= limit_length).all(axis=1)

# Built in one step (no per-row concat) and assigned rather than appended, so
# re-running this cell gives the same df_1 instead of duplicating rows.
df_1 = candidates[within_limit].reset_index(drop=True)

In [57]:
# Inspect the last rows of the length-filtered frame.
df_1.tail()

Unnamed: 0,anchor,positive,negative
99995,How can I study more hard?,How do you study effectively?,How long should I study?
99996,Are protests against Trump's presidency justif...,Are riots against Trump's presidency justified?,Why have the number of race riots increased du...
99997,Are protests against Trump's presidency justif...,Are riots against Trump's presidency justified?,Will the recent violence directed at protester...
99998,Are protests against Trump's presidency justif...,Are riots against Trump's presidency justified?,Were there riots when Obama won?
99999,What was your most embarrassing moment infront...,What is the most embarrassing moment in front ...,Which was the most embarrassing moment in fron...


In [59]:
# Number of triplets kept in df_1.
len(df_1)

100000

# Load the model

In [58]:
from unsloth import FastLanguageModel
import torch

# Loader settings, kept as notebook-level variables.
max_seq_length = 2048  # maximum sequence length handed to the loader
dtype = None           # None = auto detection; float16 / bfloat16 can be forced instead
load_in_4bit = False   # True enables 4-bit quantization to reduce memory usage

# Load the fine-tuned model and its tokenizer (files are cached under cache_dir).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="universalml0/NepaliGPT-2.0-finetuned",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    cache_dir='cache_dir',
    token="",  # only needed for gated models such as meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.0.
   \\   /|    GPU: NVIDIA L4. Max memory: 21.964 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

In [66]:
# Prompt template with two positional `{}` slots:
#   1. the English text to translate (end of the Instruction section);
#   2. the response, which callers fill with '' so the model writes it.
# The "### Input:" section is empty.
# NOTE(review): the wording (including its typos) is left untouched; presumably it
# has to match the prompt the model was fine-tuned with — confirm before editing.
translation_prompt = """Below is instruction that describes a task, paired with input which provide further context. Write a response  that appropriately complete the request.

### Instruction:
Your task is only to do the translation of given sentences or questions to Nepali language. Donot lose the context or meaning of senteces of questions while translation. Translate the following English sentences or questions to Nepali language: {}

### Input:


### Response:
{}"""

In [79]:
def unsloth_llm(question):
    """Translate one English sentence or question with the loaded model.

    The text is inserted into ``translation_prompt``, the model generates a
    completion on the GPU, and the segment that follows the first
    ``### Response:`` marker in the decoded output is returned.

    Args:
        question: Text to translate. An empty value returns ``""`` without
            calling the model.

    Returns:
        The decoded text after the response marker, or ``""`` for empty input.
    """
    if len(question) == 0:
        return ""

    # Switch the model into Unsloth's inference mode before generating.
    FastLanguageModel.for_inference(model)

    prompt_text = translation_prompt.format(question, '')
    encoded_prompt = tokenizer([prompt_text], return_tensors = "pt").to("cuda")
    generated_ids = model.generate(
        **encoded_prompt,
        max_new_tokens = 2048,
        use_cache = True,
        temperature=0.1,
    )
    decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

    # The decoded text includes the prompt; keep what comes after the marker.
    response_parts = decoded[0].split('### Response:\n')
    return response_parts[1]

In [93]:
# Smoke test: translate a single English question.
unsloth_llm("""What is the meaning of this science""")

'विज्ञानको अर्थ के हो?'

In [None]:
# a = """महान् मन्दी एक आर्थिक अवस्था थियो जुन 1929-1939 सम्म चलेको थियो, जसले यसलाई आधुनिक इतिहासमा सबैभन्दा लामो मन्दी बनायो। यसले संयुक्त राज्य अमेरिका मात्र होइन तर विश्वभरका देशहरूलाई पनि असर गऱ्यो, जसले महत्त्वपूर्ण सामाजिक र आर्थिक विघटन निम्त्यायो। महान् मन्दीका धेरै प्रमुख कारणहरू थिए, जसमा समावेश छन्: १. सन् १९२९ को शेयर बजार दुर्घटना – अक्टोबर १९२९ मा, शेयर बजारले महत्त्वपूर्ण गिरावट को अनुभव गर्यो, जसले लाखौं लगानीकर्ताहरूलाई नष्ट गर्यो। यो घटनालाई धेरैले महान् मन्दीको प्रारम्भिक ट्रिगर मान्छन्। २. ओभरप्रोडक्सन – १९२० को दशकमा, धेरै उद्योगहरूले उपभोक्ताहरूले चाहेको वा खर्च गर्न सक्ने भन्दा बढी सामानहरू उत्पादन गरे। यसले अन्ततः मागमा गिरावट ल्यायो, जसले गर्दा रोजगारी गुम्यो, ज्याला घट्यो र व्यवसायहरू असफल भए। ३. उच्च ट्यारिफ र युद्ध ऋण – संरक्षणवादी व्यापार नीतिहरूको रूपमा उच्च ट्यारिफले विश्वव्यापी व्यापारमा गिरावट ल्यायो, किनकि अन्य देशहरूले प्रतिक्रियामा आफ्नै ट्यारिफ हरू बढाए। यसका अतिरिक्त, धेरै देशहरू युद्धको ऋण तिर्न संघर्ष गरिरहेका थिए, जसले आर्थिक अस्थिरता निम्त्यायो। ४. बैंकको असफलता – जब मागमा गिरावट आयो, धेरै बैंकहरू असफल भए, जसले गर्दा बैंकिङ प्रणालीमा विश्वास गुम्यो। यसले गर्दा बैंकहरूबाट पैसा निकाल्ने र अझ धेरै बैंकहरू असफल हुने क्रम बढ्यो। ५. खडेरीको अवस्था – धूलो बाउल एक गम्भीर खडेरी र धूलो आँधी थियो जुन संयुक्त राज्य अमेरिकाको ग्रेट प्लेन्स क्षेत्रमा १९३० को दशकमा आयो। यसले कृषिमा महत्त्वपूर्ण प्रभाव पार्यो, जसले गर्दा धेरै किसानहरूले आफ्नो जमिन गुमाए र जीविकोपार्जन गुमाए, जसले गर्दा मन्दीको प्रभाव अझ खराब भयो। समग्रमा महान् मन्दी विभिन्न कारणहरूद्वारा प्रेरित भएको थियो, जसमा आर्थिक, वातावरणीय र राजनीतिक कारकहरू समावेश थिए। यसको प्रभाव विश्वव्यापी थियो, जसले लाखौं मानिसहरूलाई असर गरेको थियो।"""

In [None]:
# len(tokenizer.tokenize(a))

# Initial run (creates out.csv)

In [94]:
# Start df_2 (the translated triplets) as an empty frame with the three triplet columns.
df_2 = pd.DataFrame(columns=['anchor', 'positive', 'negative'])

In [95]:
%%time
for index, row in df_1[:2].iterrows():
    
    nepali_anchor = unsloth_llm(row["anchor"])
    nepali_positive = unsloth_llm(row["positive"])
    nepali_negative = unsloth_llm(row["negative"])
    
    new_dic = {"anchor": [nepali_anchor], "positive": [nepali_positive], "negative": [nepali_negative] }
    temp_df = pd.DataFrame(new_dic)
    # display(temp_df)

    df_2 = pd.concat([df_2, temp_df], ignore_index = True)
    df_2.reset_index()
    df_2.to_csv('out.csv')
    print(index)

0
1
CPU times: user 19.5 s, sys: 2.36 ms, total: 19.5 s
Wall time: 19.5 s


In [96]:
# Inspect the last translated rows.
df_2.tail()

Unnamed: 0,anchor,positive,negative
0,भारतमा हामीले अमेरिकामा जस्तै एक-अर्कासँग राजन...,भारतमा राजनीतिज्ञहरूबीच संयुक्त राज्य अमेरिकाम...,के क्वोरामा भारत-पाकिस्तान बहस रोक्न सकिन्छ? ह...
1,भारतमा हामीले अमेरिकी शैलीको एक-देखि-एक राजनीत...,भारतमा राजनीतिज्ञहरूबीच संयुक्त राज्य अमेरिकाम...,राजनीतिज्ञहरूले किन विश्वभरि भइरहेका मुद्दाहरू...


In [98]:
# Check whether the first two translated negatives are identical.
df_2.iloc[1]["negative"]==df_2.iloc[0]["negative"]

False

# Resume from the saved CSV

In [99]:
import pandas as pd

# Reload the saved translations: drop the index column written by to_csv and
# replace missing cells with empty strings.
df_2 = (
    pd.read_csv('out.csv')
    .drop(columns=['Unnamed: 0'])
    .fillna('')
)
df_2.head()

Unnamed: 0,anchor,positive,negative
0,भारतमा हामीले अमेरिकामा जस्तै एक-अर्कासँग राजन...,भारतमा राजनीतिज्ञहरूबीच संयुक्त राज्य अमेरिकाम...,के क्वोरामा भारत-पाकिस्तान बहस रोक्न सकिन्छ? ह...
1,भारतमा हामीले अमेरिकी शैलीको एक-देखि-एक राजनीत...,भारतमा राजनीतिज्ञहरूबीच संयुक्त राज्य अमेरिकाम...,राजनीतिज्ञहरूले किन विश्वभरि भइरहेका मुद्दाहरू...


In [100]:
# Inspect the last rows of the reloaded translations.
df_2.tail()

Unnamed: 0,anchor,positive,negative
0,भारतमा हामीले अमेरिकामा जस्तै एक-अर्कासँग राजन...,भारतमा राजनीतिज्ञहरूबीच संयुक्त राज्य अमेरिकाम...,के क्वोरामा भारत-पाकिस्तान बहस रोक्न सकिन्छ? ह...
1,भारतमा हामीले अमेरिकी शैलीको एक-देखि-एक राजनीत...,भारतमा राजनीतिज्ञहरूबीच संयुक्त राज्य अमेरिकाम...,राजनीतिज्ञहरूले किन विश्वभरि भइरहेका मुद्दाहरू...


In [101]:
# Sanity check: total number of missing cells in df_2.
null_count_df_2 = df_2.isna().sum().sum()
null_count_df_2

0

In [102]:
# Number of triplets translated so far.
len(df_2)

2

In [None]:
# Translate the remaining rows of df_1 and append them to df_2 / out.csv.
#
# Row i of df_2 is assumed to be the translation of row i of df_1, so work
# resumes at row len(df_2). The start row used to be hard-coded to 2, which
# re-translated rows and misaligned df_2 with df_1 when this cell was re-run
# after an interruption.
text_columns = ["anchor", "positive", "negative"]
start_row = len(df_2)   # rows [0, start_row) are already translated
end_row = 100000        # stop before this row of df_1
checkpoint_every = 50   # rewrite out.csv after this many accumulated rows

# English text -> translation, seeded from the rows that are already done so a
# repeated sentence reuses its existing translation instead of calling the
# model again. Later rows overwrite earlier ones for the same text.
translation_cache = {}
for column in text_columns:
    translation_cache.update(zip(df_1[column].iloc[:start_row], df_2[column]))


def _translate_cached(text):
    """Return the cached translation of `text`, calling the model only on a cache miss."""
    if text not in translation_cache:
        translation_cache[text] = unsloth_llm(text)
    return translation_cache[text]


# Rows are collected in a list and the frame is built from it, instead of
# growing df_2 with pd.concat on every iteration.
records = df_2[text_columns].to_dict("records")
try:
    for index, row in df_1.iloc[start_row:end_row].iterrows():
        records.append({column: _translate_cached(row[column]) for column in text_columns})
        if len(records) % checkpoint_every == 0:
            pd.DataFrame(records, columns=text_columns).to_csv('out.csv')
        print(index)
finally:
    # Runs on normal completion and on interruption, so df_2 and out.csv
    # always contain every row translated so far.
    df_2 = pd.DataFrame(records, columns=text_columns)
    df_2.to_csv('out.csv')

2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268


# Save to Hugging Face

In [105]:
import pandas as pd

# Reload the translations without the index column written by to_csv, print
# the number of missing cells, then drop any incomplete rows.
df_3 = pd.read_csv('out.csv').drop(columns=['Unnamed: 0'])
null_count = df_3.isna().sum().sum()
print(null_count)
df_3 = df_3.dropna(axis=0)
df_3.head()

0


Unnamed: 0,anchor,positive,negative
0,भारतमा हामीले अमेरिकामा जस्तै एक-अर्कासँग राजन...,भारतमा राजनीतिज्ञहरूबीच संयुक्त राज्य अमेरिकाम...,के क्वोरामा भारत-पाकिस्तान बहस रोक्न सकिन्छ? ह...
1,भारतमा हामीले अमेरिकी शैलीको एक-देखि-एक राजनीत...,भारतमा राजनीतिज्ञहरूबीच संयुक्त राज्य अमेरिकाम...,राजनीतिज्ञहरूले किन विश्वभरि भइरहेका मुद्दाहरू...
2,भारतमा हामीले अमेरिकी शैलीको एक-देखि-एक राजनीत...,भारतमा राजनीतिज्ञहरूबीच संयुक्त राज्य अमेरिकाम...,भारतमा शिक्षित राजनीतिज्ञहरूले फरक पार्न सक्छन...
3,वनप्लस वन के हो?,ओनप्लस वन के हो?,वनप्लस वन किन यति राम्रो छ?
4,वनप्लस वन के हो?,ओनप्लस वन के हो?,वनप्लस ४ के हुनेछ?


In [106]:
import os

from datasets import load_dataset

# Load the translated triplets from out.csv (the CSV loader puts them in a
# "train" split) and upload them to the Hugging Face Hub.
dataset = load_dataset("csv", data_files="out.csv")

# out.csv was written with the DataFrame index as an unnamed first column;
# remove it so only anchor / positive / negative are uploaded.
if "Unnamed: 0" in dataset["train"].column_names:
    dataset = dataset.remove_columns("Unnamed: 0")

# Read the access token from the environment instead of hard-coding it in the
# notebook. When HF_TOKEN is unset, token=None is passed.
# NOTE(review): token=None presumably falls back to the locally stored login — confirm.
dataset.push_to_hub("universalml0/Nepali_embedding_dataset", token=os.environ.get("HF_TOKEN"))

Generating train split: 0 examples [00:00, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/46 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/universalml0/Nepali_embedding_dataset/commit/76a62bd998f59dd7d8e6795c42f619a61d517705', commit_message='Upload dataset', commit_description='', oid='76a62bd998f59dd7d8e6795c42f619a61d517705', pr_url=None, pr_revision=None, pr_num=None)