In [1]:
import pandas as pd
import random

In [None]:
# Subject vocabulary: one (English, Hindi, Punjabi) triple per entry.
# Index 0 feeds the English sentence, index 1 the Hindi one and index 2 the
# Punjabi one in generate_sentence().
#
# NOTE: "He" and "She" share the same Hindi ("वह") and Punjabi ("ਉਹ") form, so
# the Hindi/Punjabi -> English direction sees identical inputs mapped to two
# different English targets.
subjects = [
    ("I", "मैं", "ਮੈਂ"),
    ("You", "तुम", "ਤੂੰ"),
    ("He", "वह", "ਉਹ"),
    ("She", "वह", "ਉਹ"),
    ("We", "हम", "ਅਸੀਂ"),
    # NOTE(review): "ਉਹਨਾਂ" looks like the oblique form; the nominative "they"
    # is usually written "ਉਹ" — confirm with a native speaker before changing.
    ("They", "वे", "ਉਹਨਾਂ"),
    ("My father", "मेरे पिता", "ਮੇਰੇ ਪਿਤਾ"),
    ("My mother", "मेरी माँ", "ਮੇਰੀ ਮਾਂ"),
    ("The boy", "लड़का", "ਲੜਕਾ"),
    ("The girl", "लड़की", "ਲੜਕੀ"),
    ("The teacher", "शिक्षक", "ਅਧਿਆਪਕ"),
    ("The student", "विद्यार्थी", "ਵਿਦਿਆਰਥੀ"),
    ("The doctor", "डॉक्टर", "ਡਾਕਟਰ"),
    ("The farmer", "किसान", "ਕਿਸਾਨ"),
]

# Verb vocabulary: one (English, Hindi, Punjabi) triple per entry.
#
# Every form is a fixed string: the English side is the bare verb and the
# Hindi/Punjabi side always ends in "है" / "ਹੈ". Nothing here is inflected for
# the subject, so generated sentences such as "He play ..." or "हम ... खाता है"
# do not agree in person, number or gender. Keep that in mind when judging
# model output trained on this corpus.
verbs = [
    ("eat", "खाता है", "ਖਾਂਦਾ ਹੈ"),
    ("go", "जाता है", "ਜਾਂਦਾ ਹੈ"),
    ("play", "खेलता है", "ਖੇਡਦਾ ਹੈ"),
    ("read", "पढ़ता है", "ਪੜ੍ਹਦਾ ਹੈ"),
    ("write", "लिखता है", "ਲਿਖਦਾ ਹੈ"),
    # Spelled with the bindi (ਂ), like "ਖਾਂਦਾ" and "ਜਾਂਦਾ" above; the earlier
    # "ਪੀਦਾ" was a misspelling.
    ("drink", "पीता है", "ਪੀਂਦਾ ਹੈ"),
    ("watch", "देखता है", "ਵੇਖਦਾ ਹੈ"),
    ("buy", "खरीदता है", "ਖਰੀਦਦਾ ਹੈ"),
    ("sell", "बेचता है", "ਵੇਚਦਾ ਹੈ"),
    ("cook", "पकाता है", "ਪਕਾਉਂਦਾ ਹੈ"),
    ("clean", "साफ करता है", "ਸਾਫ ਕਰਦਾ ਹੈ"),
    ("teach", "सिखाता है", "ਸਿਖਾਉਂਦਾ ਹੈ"),
    ("learn", "सीखता है", "ਸਿੱਖਦਾ ਹੈ"),
    ("travel", "यात्रा करता है", "ਯਾਤਰਾ ਕਰਦਾ ਹੈ"),
]

# Object vocabulary: one (English, Hindi, Punjabi) triple per entry.
# generate_sentence() pairs these with verbs purely at random, so semantically
# odd combinations (e.g. "play water") are expected in the generated corpus.
objects = [
    ("food", "खाना", "ਖਾਣਾ"),
    ("school", "स्कूल", "ਸਕੂਲ"),
    ("book", "किताब", "ਕਿਤਾਬ"),
    ("water", "पानी", "ਪਾਣੀ"),
    ("tea", "चाय", "ਚਾਹ"),
    ("home", "घर", "ਘਰ"),
    ("market", "बाज़ार", "ਬਾਜ਼ਾਰ"),
    ("car", "गाड़ी", "ਗੱਡੀ"),
    ("computer", "कंप्यूटर", "ਕੰਪਿਊਟਰ"),
    ("mobile", "मोबाइल", "ਮੋਬਾਈਲ"),
    ("movie", "फ़िल्म", "ਫਿਲਮ"),
    ("music", "संगीत", "ਸੰਗੀਤ"),
    ("game", "खेल", "ਖੇਡ"),
    ("newspaper", "अखबार", "ਅਖਬਾਰ"),
    ("letter", "पत्र", "ਪੱਤਰ"),
]

In [3]:
def generate_sentence(rng=None, subject_pool=None, verb_pool=None, object_pool=None):
    """Build one random parallel sentence in English, Hindi and Punjabi.

    One subject, one verb and one object are drawn (in that order) and joined
    with single spaces. English uses subject-verb-object order; Hindi and
    Punjabi use subject-object-verb order.

    No grammatical agreement is applied: the words are concatenated exactly
    as they appear in the vocabulary triples.

    Args:
        rng: Object exposing ``choice(seq)``, e.g. ``random.Random(seed)``.
            Defaults to the module-level ``random`` generator.
        subject_pool: Sequence of (English, Hindi, Punjabi) triples to draw
            the subject from. Defaults to the global ``subjects``.
        verb_pool: Same, for the verb. Defaults to the global ``verbs``.
        object_pool: Same, for the object. Defaults to the global ``objects``.

    Returns:
        A ``(english, hindi, punjabi)`` tuple of strings.
    """
    chooser = random if rng is None else rng

    # Globals are looked up lazily so callers that pass explicit pools do not
    # need the notebook-level vocabularies to exist.
    s = chooser.choice(subjects if subject_pool is None else subject_pool)
    v = chooser.choice(verbs if verb_pool is None else verb_pool)
    o = chooser.choice(objects if object_pool is None else object_pool)

    en = f"{s[0]} {v[0]} {o[0]}"   # subject verb object
    hi = f"{s[1]} {o[1]} {v[1]}"   # subject object verb
    pa = f"{s[2]} {o[2]} {v[2]}"   # subject object verb

    return en, hi, pa

In [4]:
# Corpus size and RNG seed are named here so they are easy to find and tune.
NUM_SENTENCES = 20000   # Increase if needed
RANDOM_SEED = 42

# Seed the global generator so "Restart Kernel -> Run All" reproduces the
# exact same corpus; generate_sentence() draws from `random` by default.
random.seed(RANDOM_SEED)

# Each element is an (english, hindi, punjabi) tuple. Sentences are sampled
# with replacement, so the same sentence can appear more than once.
data = [generate_sentence() for _ in range(NUM_SENTENCES)]

df = pd.DataFrame(data, columns=["english", "hindi", "punjabi"])

df.head()

Unnamed: 0,english,hindi,punjabi
0,We play water,हम पानी खेलता हूँ,ਅਸੀਂ ਪਾਣੀ ਖੇਡਦਾ ਹਾਂ
1,He play school,वह स्कूल खेलता हूँ,ਉਹ ਸਕੂਲ ਖੇਡਦਾ ਹਾਂ
2,I play book,मैं किताब खेलता हूँ,ਮੈਂ ਕਿਤਾਬ ਖੇਡਦਾ ਹਾਂ
3,They go school,वे स्कूल जाता हूँ,ਉਹਨਾਂ ਸਕੂਲ ਜਾਂਦਾ ਹਾਂ
4,They read home,वे घर पढ़ता हूँ,ਉਹਨਾਂ ਘਰ ਪੜ੍ਹਦਾ ਹਾਂ


In [5]:
# Every translation direction as (source column, target column, source prefix).
# The prefix carries the source-language tag and the requested target tag.
# Order matters: it fixes the order of the six pairs emitted per sentence.
TRANSLATION_DIRECTIONS = [
    ("english", "hindi", "<en> <to_hi> "),     # English → Hindi
    ("english", "punjabi", "<en> <to_pa> "),   # English → Punjabi
    ("hindi", "english", "<hi> <to_en> "),     # Hindi → English
    ("punjabi", "english", "<pa> <to_en> "),   # Punjabi → English
    ("hindi", "punjabi", "<hi> <to_pa> "),     # Hindi → Punjabi
    ("punjabi", "hindi", "<pa> <to_hi> "),     # Punjabi → Hindi
]

# Build (model_input, model_target) pairs: six per row of `df`, row by row.
# Columns are zipped directly instead of using DataFrame.iterrows(), which
# materialises a Series for every row and is slow on tens of thousands of rows.
training_data = []

for english, hindi, punjabi in zip(df["english"], df["hindi"], df["punjabi"]):
    sentence = {"english": english, "hindi": hindi, "punjabi": punjabi}

    for source_col, target_col, prefix in TRANSLATION_DIRECTIONS:
        training_data.append((prefix + sentence[source_col],
                              "<start> " + sentence[target_col] + " <end>"))