In [31]:
import pandas as pd
from devatrans import DevaTrans

# Load the 1930-39 song table from the datasets folder into a DataFrame
data = pd.read_csv('./datasets/1930-39.csv')

# Preview the leading five rows to check that the columns parsed as expected
print(data.head(n=5))

                                               Title           Film  Year  \
0        suquun dil ko mayassar gulosamar men nahiin  street singer  1938   
1             kamasinii men dil pe gam kaa baar kyon  amrit manthan  1934   
2  din niike biite jaate hain sumaran kar siyaara...  pooran bhagat  1933   
3                 kaahe ko mohe chhere re beimanawaa         pukaar  1939   
4                       suhaag kii raat aaii sajanii       adhikaar  1938   

                     Singer         Composer              Lyricist  \
0  saigal kanan devi saigal  rai chand boral       aarzoo lakhnawi   
1        shanta apte chorus  keshavrao bhole     veer muhammedpuri   
2                    saigal        r c boral                   NaN   
3      sardar akhtar chorus        mir sahab          kamal amrohi   
4             pahari sanyal      timir baran  munshi arzoo rasheed   

                                              Lyrics  
0    suquun dil ko mayassar gulosamar men nahiin ...  
1    k

In [32]:
# Shared DevaTrans instance; the back-transliteration helpers in this notebook
# call dt.back_transliterate(...) on it.
dt = DevaTrans()

def back_transliterate(text):
    """Back-transliterate romanized lyrics (ITRANS convention) and tokenize them.

    Relies on two module-level objects: ``dt`` (the DevaTrans instance) and
    ``nlp`` (the Stanza Hindi pipeline). ``nlp`` is created in a later cell of
    this notebook, so that cell must have been run before this function is
    called.

    Args:
        text: Romanized lyrics of one song. Anything that is not a string
            (e.g. the NaN pandas stores for a missing cell) and blank strings
            are treated as "no lyrics".

    Returns:
        list: The ``text`` of every word Stanza finds, in document order;
        ``[]`` when there are no lyrics.
    """
    # Missing cells reach us as float NaN; neither the transliterator nor the
    # pipeline should be fed a non-string, so short-circuit here.
    if not isinstance(text, str) or not text.strip():
        return []

    # Romanized -> Devanagari, treating the whole lyric as one sentence input
    devanagari_text = dt.back_transliterate(
        input_type="sen", from_convention="itrans", sentence=text
    )

    # Tokenize with the Stanza pipeline and flatten sentences into one list
    doc = nlp(devanagari_text)
    return [word.text for sent in doc.sentences for word in sent.words]

# Transliterate + tokenize each song's lyrics and store the token lists
data['lyrics_devanagari'] = data['Lyrics'].map(back_transliterate)

# Preview the leading five rows, now including the new column
print(data.head(n=5))

                                               Title           Film  Year  \
0        suquun dil ko mayassar gulosamar men nahiin  street singer  1938   
1             kamasinii men dil pe gam kaa baar kyon  amrit manthan  1934   
2  din niike biite jaate hain sumaran kar siyaara...  pooran bhagat  1933   
3                 kaahe ko mohe chhere re beimanawaa         pukaar  1939   
4                       suhaag kii raat aaii sajanii       adhikaar  1938   

                     Singer         Composer              Lyricist  \
0  saigal kanan devi saigal  rai chand boral       aarzoo lakhnawi   
1        shanta apte chorus  keshavrao bhole     veer muhammedpuri   
2                    saigal        r c boral                   NaN   
3      sardar akhtar chorus        mir sahab          kamal amrohi   
4             pahari sanyal      timir baran  munshi arzoo rasheed   

                                              Lyrics  \
0    suquun dil ko mayassar gulosamar men nahiin ...   
1   

In [33]:
def back_transliterate_velthuis(text):
    """Back-transliterate romanized lyrics (Velthuis convention) and tokenize them.

    Same pipeline as the ITRANS variant, but the romanized input is read with
    DevaTrans' "velthuis" convention. Relies on the module-level ``dt``
    (DevaTrans instance) and ``nlp`` (Stanza Hindi pipeline); ``nlp`` is
    created in a later cell of this notebook, so that cell must have been run
    before this function is called.

    Args:
        text: Romanized lyrics of one song. Anything that is not a string
            (e.g. the NaN pandas stores for a missing cell) and blank strings
            are treated as "no lyrics".

    Returns:
        list: The ``text`` of every word Stanza finds, in document order;
        ``[]`` when there are no lyrics.
    """
    # Missing cells reach us as float NaN; do not hand those to the
    # transliterator or the pipeline.
    if not isinstance(text, str) or not text.strip():
        return []

    # Romanized -> Devanagari, treating the whole lyric as one sentence input
    devanagari_text = dt.back_transliterate(
        input_type="sen", from_convention="velthuis", sentence=text
    )

    # Tokenize with the Stanza pipeline and flatten sentences into one list
    doc = nlp(devanagari_text)
    return [word.text for sent in doc.sentences for word in sent.words]

# Velthuis-based transliteration + tokenization of each song's lyrics
data['lyrics_devanagari_velthuis'] = data['Lyrics'].map(back_transliterate_velthuis)

# Show the romanized source next to its Velthuis-derived tokens (five rows)
print(data.loc[:, ['Lyrics', 'lyrics_devanagari_velthuis']].head(n=5))

                                              Lyrics  \
0    suquun dil ko mayassar gulosamar men nahiin ...   
1    kamasinii men dil pe gam kaa baar kyon 2 vaa...   
2   aa aa  din niike biite jaate hain sumaran kar...   
3       sa \tkaahe ko mohe chhere re beimanawaa 2...   
4       suhaag kii raat aaii sajanii kaahe bhare ...   

                          lyrics_devanagari_velthuis  
0  [सुqऊन्, दिल्, को, मयस्सर्, गुलोसमर्, मेन्, नह...  
1  [कमसिनी, मेन्, दिल्, पे, गम्, का, बार्, क्योन्...  
2  [आ, आ, दिन्, नीके, बीते, जाते, हैन्, सुमरन्, क...  
3  [स, काहे, को, मोहे, छ्हेरे, रे, बेइमनwआ, २, का...  
4  [सुहाग्, की, रात्, आई, सजनी, काहे, भरे, तोरे, ...  


In [34]:
import stanza

# Fetch Stanza's default model package for Hindi ('hi'); the Pipeline built
# later in this notebook loads its processors from these resources.
stanza.download('hi')


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 379kB [00:00, 6.24MB/s]                    
2024-03-26 15:28:36 INFO: Downloaded file to C:\Users\rushi\stanza_resources\resources.json
2024-03-26 15:28:36 INFO: Downloading default packages for language: hi (Hindi) ...
2024-03-26 15:28:37 INFO: File exists: C:\Users\rushi\stanza_resources\hi\default.zip
2024-03-26 15:28:39 INFO: Finished downloading models and saved to C:\Users\rushi\stanza_resources


In [35]:
# Build the Stanza Hindi pipeline with only the tokenize, POS and lemma
# processors (no MWT processor is requested).
# NOTE(review): the back-transliteration helpers defined in earlier cells call
# `nlp`, so on a fresh kernel this cell has to run before they are applied.
nlp = stanza.Pipeline(lang='hi', processors='tokenize,pos,lemma')

def extract_lemmas(text):
    """Lemmatize Devanagari lyrics and return the lemmas as one string.

    Uses the module-level Stanza pipeline ``nlp``.

    Args:
        text: Either a sequence of token strings (as produced by the
            back-transliteration helpers) or an already-joined string.
            ``None`` / NaN are treated as "no lyrics".

    Returns:
        str: Lemmas separated by single spaces, in document order. A word for
        which the pipeline reports no lemma contributes its surface text
        instead. ``''`` when there is nothing to lemmatize.
    """
    if isinstance(text, str):
        # Joining a str would put a space between every character and destroy
        # the words, so use it as-is.
        joined = text
    elif text is None or (isinstance(text, float) and text != text):
        # None or float NaN (NaN is the only float not equal to itself)
        return ''
    else:
        joined = ' '.join(text)

    # Nothing to process; skip the pipeline call entirely
    if not joined.strip():
        return ''

    doc = nlp(joined)
    # Fall back to the surface form when lemma is None so the join cannot fail
    return ' '.join(
        word.lemma if word.lemma is not None else word.text
        for sent in doc.sentences
        for word in sent.words
    )

# Lemmatize the ITRANS-derived tokens of every song
data['lyrics_lemmatized'] = data['lyrics_devanagari'].map(extract_lemmas)

# Compare tokens and lemmas side by side for the leading five rows
print(data.loc[:, ['lyrics_devanagari', 'lyrics_lemmatized']].head(n=5))

2024-03-26 15:28:39 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 379kB [00:00, 6.65MB/s]                    
2024-03-26 15:28:39 INFO: Downloaded file to C:\Users\rushi\stanza_resources\resources.json
2024-03-26 15:28:39 INFO: Loading these models for language: hi (Hindi):
| Processor | Package       |
-----------------------------
| tokenize  | hdtb          |
| pos       | hdtb_charlm   |
| lemma     | hdtb_nocharlm |

2024-03-26 15:28:39 INFO: Using device: cpu
2024-03-26 15:28:39 INFO: Loading: tokenize
2024-03-26 15:28:39 INFO: Loading: pos
2024-03-26 15:28:40 INFO: Loading: lemma
2024-03-26 15:28:40 INFO: Done loading processors!


ValueError: If neither 'pretokenized' or 'no_ssplit' option is enabled, the input to the TokenizerProcessor must be a string or a Document object.  Got <class 'list'>

In [None]:
# Lemmatize the Velthuis-derived tokens of every song
data['lyrics_lemmatized_velthuis'] = data['lyrics_devanagari_velthuis'].map(extract_lemmas)

# Compare tokens and lemmas side by side for the leading five rows
print(data.loc[:, ['lyrics_devanagari_velthuis', 'lyrics_lemmatized_velthuis']].head(n=5))

                          lyrics_devanagari_velthuis  \
0    सुqऊन् दिल् को मयस्सर् गुलोसमर् मेन् नहीन् ज...   
1    कमसिनी मेन् दिल् पे गम् का बार् क्योन् २ वा ...   
2   आ आ  दिन् नीके बीते जाते हैन् सुमरन् कर् सिया...   
3       स \tकाहे को मोहे छ्हेरे रे बेइमनwआ २ \tका...   
4       सुहाग् की रात् आई सजनी काहे भरे तोरे नैना...   

                          lyrics_lemmatized_velthuis  
0  सुqऊन् दिल् को मयस्सर् गुलोसमर् मेन् नहीन् जो ...  
1  कमसिनी मेन् दिल् पे गम् का बार् क्योन् २ वा यह...  
2  आ आ दिन् नीके बीत जा हैन् सुमरन् कर् सियाराम् ...  
3  स काहे को मोहे छ्हेरे रे बेइमनwआ २ काहे को मोह...  
4  सुहाग् का रात् आ सजनी काहे भर तोरे नैना २ सुहा...  


In [None]:
import Levenshtein as lev

def calculate_change(original, lemmatized):
    """Return how much lemmatization changed a text, as a normalized edit distance.

    Both arguments are coerced to plain strings first, so that a token list
    and a joined string are compared character by character rather than
    tokens being compared against characters.

    Args:
        original: Text before lemmatization; a string or a sequence of token
            strings (joined with single spaces).
        lemmatized: Text after lemmatization; same accepted forms.
            ``None`` / NaN in either argument count as the empty string.

    Returns:
        float: Levenshtein distance divided by the character length of the
        original text (denominator floored at 1 to avoid dividing by zero).
    """
    def _as_text(value):
        # Normalize str / token sequence / missing value to a single string
        if isinstance(value, str):
            return value
        if value is None or (isinstance(value, float) and value != value):
            return ''
        return ' '.join(value)

    original_text = _as_text(original)
    lemmatized_text = _as_text(lemmatized)

    distance = lev.distance(original_text, lemmatized_text)
    # Normalize by the original's length so long and short lyrics are comparable
    return distance / max(len(original_text), 1)

# Score every song: pair each original text with its lemmatized counterpart
data['lemmatization_change'] = [
    calculate_change(original, lemmatized)
    for original, lemmatized in zip(data['lyrics_devanagari'], data['lyrics_lemmatized'])
]

# Show the inputs next to the resulting score for the leading five rows
print(data.loc[:, ['lyrics_devanagari', 'lyrics_lemmatized', 'lemmatization_change']].head(n=5))

                                   lyrics_devanagari  \
0    सुक़ून् दिल् क्o मयस्सर् गुल्oसमर् मेन् नह्ीन...   
1    कमसिन्ी मेन् दिल् पे गम् का बार् क्य्oन् २ व...   
2   आ आ  दिन् न्ीके ब्ीते जाते हैन् सुमरन् कर् सि...   
3       स \tकाहे क्o म्oहे च्हेरे रे बेइमनwआ २ \t...   
4       सुहाग् क्ी रात् आी सजन्ी काहे भरे त्oरे न...   

                                   lyrics_lemmatized  lemmatization_change  
0  सुक़ून् दिल् क्o मयस्सर् गुल्oसमर् मेन् नह्ीन् ...              0.035230  
1  कमसिन्ी मेन् दिल् पे गम् का बार् क्य्oन् २ वा ...              0.056962  
2  आ आ दिन् न्ीके ब्ीते जा हैन् सुमरन् कर् सियारा...              0.044547  
3  स काहे क्o म्oहे च्हेरे रे बेइमनwआ २ काहे क्o ...              0.107383  
4  सुहाग् क्ी रात् आी सजन्ी काहे भर त्oरे नैना २ ...              0.023026  
