In [16]:


# Install a pinned spaCy release, then download the small English and German pipelines.
!pip install -U spacy==3.6.1
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm

Collecting spacy==3.6.1
  Downloading spacy-3.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting thinc<8.2.0,>=8.1.8 (from spacy==3.6.1)
  Downloading thinc-8.1.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting typer<0.10.0,>=0.3.0 (from spacy==3.6.1)
  Downloading typer-0.9.4-py3-none-any.whl.metadata (14 kB)
Collecting pathy>=0.10.0 (from spacy==3.6.1)
  Downloading pathy-0.11.0-py3-none-any.whl.metadata (16 kB)
Collecting smart-open<7.0.0,>=5.2.1 (from spacy==3.6.1)
  Downloading smart_open-6.4.0-py3-none-any.whl.metadata (21 kB)
Collecting pathlib-abc==0.1.1 (from pathy>=0.10.0->spacy==3.6.1)
  Downloading pathlib_abc-0.1.1-py3-none-any.whl.metadata (18 kB)
Downloading spacy-3.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m46.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pathy-0.11.0-py3-none-a

In [2]:
import pandas as pd
import re
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset paths used below live under it.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import tensorflow as tf
# Report how many GPU devices TensorFlow can see in this runtime.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


Num GPUs Available:  1


‣ Step 1: Import and merge all three datasets.

In [3]:
# Common Crawl corpus: read each language file into a one-column DataFrame,
# one whitespace-stripped line per row, and show the resulting shape.

# German side
with open('/content/drive/MyDrive/commoncrawl_de_en.txt', encoding='utf-8') as file:
    data = list(map(str.strip, file))
commoncrawl_de = pd.DataFrame({'german': data})
display("The shape of commoncrawl_de_en dataset : ", commoncrawl_de.shape)

# English side
with open('/content/drive/MyDrive/commoncrawl_en_de.txt', encoding='utf-8') as file:
    data = list(map(str.strip, file))
commoncrawl_en = pd.DataFrame({'english': data})
display("The shape of commoncrawl_en_de dataset : ", commoncrawl_en.shape)


'The shape of commoncrawl_de_en dataset : '

(2399123, 1)

'The shape of commoncrawl_en_de dataset : '

(2399123, 1)

In [4]:
# Europarl v7 corpus: read each language file into a one-column DataFrame,
# one whitespace-stripped line per row, and show the resulting shape.

# German side
with open('/content/drive/MyDrive/europarl-v7_de_en.txt', encoding='utf-8') as file:
    data = list(map(str.strip, file))
europarl_de = pd.DataFrame({'german': data})
display("The shape of europarl-v7_de_en dataset : ", europarl_de.shape)

# English side
with open('/content/drive/MyDrive/europarl-v7_en_de.txt', encoding='utf-8') as file:
    data = list(map(str.strip, file))
europarl_en = pd.DataFrame({'english': data})
display("The shape of europarl-v7_en_de dataset : ", europarl_en.shape)


'The shape of europarl-v7_de_en dataset : '

(1920209, 1)

'The shape of europarl-v7_en_de dataset : '

(1920209, 1)

In [5]:
# News Commentary v9 corpus: read each language file into a one-column DataFrame,
# one whitespace-stripped line per row, and show the resulting shape.

# German side
with open('/content/drive/MyDrive/news-commentary-v9_de_en.txt', encoding='utf-8') as file:
    data = list(map(str.strip, file))
news_de = pd.DataFrame({'german': data})
display("The shape of news-commentary-v9_de_en dataset : ", news_de.shape)

# English side
with open('/content/drive/MyDrive/news-commentary-v9_en_de.txt', encoding='utf-8') as file:
    data = list(map(str.strip, file))
news_en = pd.DataFrame({'english': data})
display("The shape of news-commentary-v9_en_de dataset : ", news_en.shape)

'The shape of news-commentary-v9_de_en dataset : '

(201854, 1)

'The shape of news-commentary-v9_en_de dataset : '

(201995, 1)

Here we can see that the English news-commentary dataset has some additional rows compared with the German one (201,995 vs. 201,854).

In [6]:
# Non-null row count of every raw corpus, per language.
commoncrawl_de_rows, europarl_de_rows, news_de_rows = (
    frame['german'].notna().sum() for frame in (commoncrawl_de, europarl_de, news_de)
)
commoncrawl_en_rows, europarl_en_rows, news_en_rows = (
    frame['english'].notna().sum() for frame in (commoncrawl_en, europarl_en, news_en)
)

# Add the per-corpus counts up for each language.
total_de_rows = sum((commoncrawl_de_rows, europarl_de_rows, news_de_rows))
total_en_rows = sum((commoncrawl_en_rows, europarl_en_rows, news_en_rows))

# Report both totals.
print(f"Total German rows before merging: {total_de_rows}")
print(f"Total English rows before merging: {total_en_rows}")

Total German rows before merging: 4521186
Total English rows before merging: 4521327


Step 2: Data cleansing

In [7]:
def load_and_clean_data(de_file_path, en_file_path, clean=False):
    """Load a German/English parallel corpus from two text files into one DataFrame.

    Both files are read as UTF-8 with one sentence per line, and every line is
    stripped of surrounding whitespace.

    Args:
        de_file_path: Path of the German text file.
        en_file_path: Path of the English text file.
        clean: When True, additionally drop lines that are empty, consist of
            digits only, or consist solely of characters other than ASCII
            letters, digits and whitespace, and collapse every whitespace run
            in the kept lines to a single space. When False, every line is kept.

    Returns:
        A DataFrame with the columns 'german' and 'english'. Rows are paired
        purely by position (left join on the German index): surplus English
        rows are dropped and missing ones become NaN.

    Warns:
        UserWarning: if the two sides end up with a different number of rows.
            Each file is filtered independently, so a mismatch means the
            positional pairing may be misaligned.
    """
    import warnings  # local import so the function does not rely on another cell

    special_only = re.compile(r'^[^a-zA-Z0-9\s]+$')

    def is_special_characters(line):
        # True when the line holds no ASCII letter, digit or whitespace at all.
        # Note that non-ASCII letters count as "special" here.
        return special_only.match(line) is not None

    def read_lines(file_path):
        # Context manager guarantees the handle is closed (the previous
        # one-liner left both files open until garbage collection).
        with open(file_path, 'r', encoding='utf-8') as file:
            return [line.strip() for line in file]

    def clean_lines(file_path):
        cleaned_data = []
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                cleaned_line = line.strip()
                if cleaned_line and not cleaned_line.isdigit() and not is_special_characters(cleaned_line):
                    cleaned_data.append(re.sub(r'\s+', ' ', cleaned_line))
        return cleaned_data

    loader = clean_lines if clean else read_lines
    de_data = loader(de_file_path)
    en_data = loader(en_file_path)

    if len(de_data) != len(en_data):
        warnings.warn(
            f"German and English row counts differ ({len(de_data)} vs {len(en_data)}); "
            "rows are paired by position, so the sentence pairs may be misaligned.",
            UserWarning,
            stacklevel=2,
        )

    de_df = pd.DataFrame(de_data, columns=['german'])
    en_df = pd.DataFrame(en_data, columns=['english'])

    return de_df.join(en_df)

# Build one German/English frame per corpus; only news-commentary is cleaned.
commoncrawl = load_and_clean_data(
    de_file_path='/content/drive/MyDrive/commoncrawl_de_en.txt',
    en_file_path='/content/drive/MyDrive/commoncrawl_en_de.txt',
)
europarl = load_and_clean_data(
    de_file_path='/content/drive/MyDrive/europarl-v7_de_en.txt',
    en_file_path='/content/drive/MyDrive/europarl-v7_en_de.txt',
)
news = load_and_clean_data(
    de_file_path='/content/drive/MyDrive/news-commentary-v9_de_en.txt',
    en_file_path='/content/drive/MyDrive/news-commentary-v9_en_de.txt',
    clean=True,
)

# Stack the three corpora under a fresh 0..n-1 index.
merged_df = pd.concat(objs=[commoncrawl, europarl, news], ignore_index=True)

display(merged_df)

Unnamed: 0,german,english
0,"iron cement ist eine gebrauchs-fertige Paste, ...",iron cement is a ready for use paste which is ...
1,Nach der Aushärtung schützt iron cement die Ko...,iron cement protects the ingot against the hot...
2,feuerfester Reparaturkitt für Feuerungsanlagen...,"a fire restant repair cement for fire places, ..."
3,Der Bau und die Reparatur der Autostraßen...,Construction and repair of highways and...
4,die Mitteilungen sollen den geschäftlichen kom...,An announcement must be commercial character.
...,...,...
4520860,Das bleibt eine der größten Errungenschaften i...,He’s secure enough to dance and sing in public...
4520861,Gleichzeitig scheint sich Zumas revolutionäre ...,In contrast to his two predecessors – the sain...
4520862,"In einer Region, wo die älteren Menschen sehr ...","Until now, populism has been the missing note ..."
4520863,Drei von zehn Südafrikanern sind jünger als 15...,"Zuma, who spent his youth herding cattle and o..."


In [8]:
# Calculate individual sums as after merging
# Non-null row counts of the joined German/English frames, i.e. after
# load_and_clean_data has paired (and, for news, cleaned) each corpus.
commoncrawl_de_rows = commoncrawl['german'].notna().sum()
commoncrawl_en_rows = commoncrawl['english'].notna().sum()
europarl_de_rows = europarl['german'].notna().sum()
europarl_en_rows = europarl['english'].notna().sum()
news_de_rows = news['german'].notna().sum()
news_en_rows = news['english'].notna().sum()

# Calculate total sums
total_de_rows = commoncrawl_de_rows + europarl_de_rows + news_de_rows
total_en_rows = commoncrawl_en_rows + europarl_en_rows + news_en_rows

# Print the total sums. The labels previously read "before merging" (copied
# from the raw-count cell), which made the two reports indistinguishable.
print(f"Total German rows after merging: {total_de_rows}")
print(f"Total English rows after merging: {total_en_rows}")

Total German rows before merging: 4520865
Total English rows before merging: 4520865


‣ Step 3: NLP preprocessing – make the dataset suitable for AI/ML model training

After cleaning the data (dropping empty lines, digit-only lines, and lines made up solely of special characters), the German and English row counts match.

In [9]:
def clean_text(text):
    """Normalise one sentence for model training.

    The text is lower-cased, every character that is neither a word character
    nor whitespace is removed (so punctuation goes, underscores stay), digits
    are removed, and whitespace runs are collapsed to a single space.

    Args:
        text: The sentence to normalise (a ``str``).

    Returns:
        The normalised sentence without leading or trailing whitespace.
    """
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    # Removing punctuation/digits can leave a space at either end
    # (e.g. "world !" -> "world "); collapsing runs alone does not remove it.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Normalise both language columns element by element with clean_text.
merged_df['german'] = merged_df['german'].map(clean_text)
merged_df['english'] = merged_df['english'].map(clean_text)

In [10]:
# Show the merged frame after both columns were passed through clean_text.
display(merged_df)

Unnamed: 0,german,english
0,iron cement ist eine gebrauchsfertige paste di...,iron cement is a ready for use paste which is ...
1,nach der aushärtung schützt iron cement die ko...,iron cement protects the ingot against the hot...
2,feuerfester reparaturkitt für feuerungsanlagen...,a fire restant repair cement for fire places o...
3,der bau und die reparatur der autostraßen,construction and repair of highways and
4,die mitteilungen sollen den geschäftlichen kom...,an announcement must be commercial character
...,...,...
4520860,das bleibt eine der größten errungenschaften i...,hes secure enough to dance and sing in public ...
4520861,gleichzeitig scheint sich zumas revolutionäre ...,in contrast to his two predecessors the saintl...
4520862,in einer region wo die älteren menschen sehr v...,until now populism has been the missing note i...
4520863,drei von zehn südafrikanern sind jünger als un...,zuma who spent his youth herding cattle and on...


In [11]:
# Persist the cleaned corpus to Drive as Parquet (without the index); a later cell reads this file back.
merged_df.to_parquet('/content/drive/MyDrive/merged_df.parquet', index=False)


In [12]:
import nltk
# Download NLTK's 'punkt_tab' tokenizer resource into the local nltk_data directory.
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [2]:
# Install the libraries used for tokenization and Parquet I/O.
# NOTE(review): unlike the spaCy install, these versions are not pinned.
!pip install transformers tokenizers nltk pyarrow




In [3]:
import pandas as pd
import torch
from transformers import AutoTokenizer
from tqdm import tqdm
from nltk.corpus import stopwords
import nltk
from multiprocessing import Pool

# Download the NLTK stopword lists and keep the German and English ones as sets.
# NOTE(review): in the visible cells these sets are only referenced by the
# commented-out preprocessing code further down.
nltk.download("stopwords")
german_stopwords = set(stopwords.words("german"))
english_stopwords = set(stopwords.words("english"))

# Load the cleaned corpus written earlier as Parquet into a pandas DataFrame.
file_path = "/content/drive/MyDrive/merged_df.parquet"  # Replace with your file path
data = pd.read_parquet(file_path)

# def preprocess_text(text, language):
#     text = text.lower()  # Lowercase
#     stopwords_set = english_stopwords if language == "english" else german_stopwords
#     text = " ".join(word for word in text.split() if word not in stopwords_set)  # Remove stopwords
#     return text

# # Wrapper function for German
# def preprocess_german(text):
#     return preprocess_text(text, language="german")

# # Wrapper function for English
# def preprocess_english(text):
#     return preprocess_text(text, language="english")

# # Parallelize preprocessing
# def parallel_apply(func, data, workers=4):
#     with Pool(workers) as p:
#         return list(tqdm(p.imap(func, data), total=len(data)))

# # Apply preprocessing in parallel
# data['german'] = parallel_apply(preprocess_german, data['german'], workers=4)
# data['english'] = parallel_apply(preprocess_english, data['english'], workers=4)

# # Initialize tokenizers
# device = "cuda" if torch.cuda.is_available() else "cpu"
# print(f"Using device: {device}")

# german_tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")
# english_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# # Tokenize in batches with GPU
# def tokenize_column_with_gpu(texts, tokenizer, batch_size=10000):
#     tokenized_output = []
#     for batch in tqdm(range(0, len(texts), batch_size)):
#         batch_texts = texts[batch:batch + batch_size].tolist()
#         # Move tensors to GPU for tokenization
#         tokens = tokenizer(
#             batch_texts,
#             padding=True,
#             truncation=True,
#             return_tensors="pt"
#         ).input_ids.to(device)  # Send to GPU
#         tokenized_output.extend(tokens.cpu().tolist())  # Move back to CPU for storage
#     return tokenized_output

# # Tokenize both columns
# data['german_tokens'] = tokenize_column_with_gpu(data['german'], german_tokenizer)
# data['english_tokens'] = tokenize_column_with_gpu(data['english'], english_tokenizer)

# # Save the tokenized data back to Parquet
# data.to_parquet("tokenized_output_with_gpu.parquet", index=False)

# Pick the torch device: "cuda" when a GPU is available, otherwise "cpu".
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the pretrained tokenizers for each language, requesting the fast implementation.
german_tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased", use_fast=True)
english_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)

# Tokenization function
def fast_tokenize_column(texts, tokenizer, batch_size=50000):
    """Tokenize a column of strings in batches and collect the token ids.

    Tokenization itself runs on the CPU, so the ids are taken straight from
    the tokenizer output. The earlier version built a torch tensor, copied it
    to the GPU and immediately back, which produced the same values at extra
    cost and tied the function to a global ``device``.

    Args:
        texts: Sliceable sequence of strings, e.g. a pandas Series or a list.
        tokenizer: Callable invoked as
            ``tokenizer(batch, padding=True, truncation=True)`` whose result
            exposes the ids under the ``"input_ids"`` key (a Hugging Face
            tokenizer does).
        batch_size: Number of texts handed to the tokenizer per call.

    Returns:
        A list with one list of token ids per input text, in input order.
        Padding is requested per call, so rows are padded relative to their
        own batch and lengths can differ between batches.
    """
    tokenized_output = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Tokenizing"):
        # list(...) accepts both a Series slice and a plain list slice.
        batch_texts = list(texts[start:start + batch_size])
        encoded = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
        )
        # Normalise every row to a plain list of ints.
        tokenized_output.extend(list(ids) for ids in encoded["input_ids"])
    return tokenized_output

# Tokenize columns
# Tokenize each language column with its own tokenizer and store the ids
# as new columns next to the text.
data['german_tokens'] = fast_tokenize_column(data['german'], german_tokenizer)
data['english_tokens'] = fast_tokenize_column(data['english'], english_tokenizer)

# Write text and token columns to Drive as Parquet (index omitted) and confirm the path.
output_path = "/content/drive/MyDrive/fast_tokenized_output.parquet"
data.to_parquet(output_path, index=False)
print(f"Tokenized data saved to {output_path}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/255k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/485k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Tokenizing: 100%|██████████| 91/91 [30:07<00:00, 19.86s/it]
Tokenizing: 100%|██████████| 91/91 [27:43<00:00, 18.28s/it]


Tokenized data saved to /content/drive/MyDrive/fast_tokenized_output.parquet


In [4]:
# import pandas as pd
# data = pd.read_parquet("/content/drive/MyDrive/fast_tokenized_output.parquet")

# # Inspect the first few rows
# print(data.head())

                                              german  \
0  iron cement ist eine gebrauchsfertige paste di...   
1  nach der aushärtung schützt iron cement die ko...   
2  feuerfester reparaturkitt für feuerungsanlagen...   
3          der bau und die reparatur der autostraßen   
4  die mitteilungen sollen den geschäftlichen kom...   

                                             english  \
0  iron cement is a ready for use paste which is ...   
1  iron cement protects the ingot against the hot...   
2  a fire restant repair cement for fire places o...   
3            construction and repair of highways and   
4       an announcement must be commercial character   

                                       german_tokens  \
0  [3, 25492, 1350, 3145, 127, 155, 23963, 4174, ...   
1  [3, 188, 21, 147, 613, 16276, 23565, 25492, 13...   
2  [3, 6224, 667, 23428, 26900, 22106, 851, 4824,...   
3  [3, 21, 3703, 42, 30, 22106, 851, 21, 4874, 12...   
4  [3, 30, 23048, 11661, 7, 1922, 86, 19108, 2

In [10]:
import dask.dataframe as dd

# Load the tokenized Parquet file as a Dask DataFrame.
# Note: this rebinds `data`, which earlier cells used for a pandas DataFrame.
data = dd.read_parquet("/content/drive/MyDrive/fast_tokenized_output.parquet")

# Inspect the first few rows
print(data.head())

                                              german  \
0  iron cement ist eine gebrauchsfertige paste di...   
1  nach der aushärtung schützt iron cement die ko...   
2  feuerfester reparaturkitt für feuerungsanlagen...   
3          der bau und die reparatur der autostraßen   
4  die mitteilungen sollen den geschäftlichen kom...   

                                             english  \
0  iron cement is a ready for use paste which is ...   
1  iron cement protects the ingot against the hot...   
2  a fire restant repair cement for fire places o...   
3            construction and repair of highways and   
4       an announcement must be commercial character   

                                       german_tokens  \
0  [3, 25492, 1350, 3145, 127, 155, 23963, 4174, ...   
1  [3, 188, 21, 147, 613, 16276, 23565, 25492, 13...   
2  [3, 6224, 667, 23428, 26900, 22106, 851, 4824,...   
3  [3, 21, 3703, 42, 30, 22106, 851, 21, 4874, 12...   
4  [3, 30, 23048, 11661, 7, 1922, 86, 19108, 2

In [2]:
# Inspect the last few rows of the tokenized data.
print(data.tail())

                                                   german  \
326556  das bleibt eine der größten errungenschaften i...   
326557  gleichzeitig scheint sich zumas revolutionäre ...   
326558  in einer region wo die älteren menschen sehr v...   
326559  drei von zehn südafrikanern sind jünger als un...   
326560  irgendwie muss zuma einen weg finden einerseit...   

                                                  english  \
326556  hes secure enough to dance and sing in public ...   
326557  in contrast to his two predecessors the saintl...   
326558  until now populism has been the missing note i...   
326559  zuma who spent his youth herding cattle and on...   
326560  yet while zumas populist appeal reflects south...   

                                            german_tokens  \
326556  [3, 93, 3141, 155, 21, 3261, 67, 18387, 972, 5...   
326557  [3, 4705, 4986, 144, 260, 45, 20338, 1170, 747...   
326558  [3, 50, 225, 7912, 743, 30, 8365, 22311, 1120,...   
326559  [3, 678, 88, 1

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
import dask.array as da

# 1. Lazy dask arrays over the two token-id columns.
german_tokens_dask = data['german_tokens'].values
english_tokens_dask = data['english_tokens'].values

# 2. Compute the dask arrays to get numpy arrays.
# compute() returns a tuple with one result per argument, so it has to be
# unpacked; assigning `da.compute(x)` directly would bind a 1-tuple and
# pad_sequences would then see a single "sequence". Computing both columns in
# one call also lets dask share the Parquet read.
german_tokens, english_tokens = da.compute(german_tokens_dask, english_tokens_dask)

# 3. Pad every row to a fixed length. The dtype must be an integer type:
# an object array cannot be converted to an int32 tensor further down.
max_length = 512  # Adjust as needed
german_tokens = pad_sequences(german_tokens, maxlen=max_length, padding='post', dtype='int32')
english_tokens = pad_sequences(english_tokens, maxlen=max_length, padding='post', dtype='int32')


# Split into training and validation sets (fixed seed for reproducibility)
X_train, X_val, y_train, y_val = train_test_split(german_tokens, english_tokens, test_size=0.2, random_state=42)

# Convert to TensorFlow tensors with appropriate dtype
X_train = tf.convert_to_tensor(X_train, dtype=tf.int32)
X_val = tf.convert_to_tensor(X_val, dtype=tf.int32)
y_train = tf.convert_to_tensor(y_train, dtype=tf.int32)
y_val = tf.convert_to_tensor(y_val, dtype=tf.int32)

# Check GPU availability and explicitly assign device
device_name = tf.test.gpu_device_name()
if device_name:
    print(f"Found GPU at: {device_name}")
else:
    print("No GPU found, using CPU instead.")
    # Give tf.device an explicit target instead of an empty string.
    device_name = "/CPU:0"

# Define model parameters (adjust as needed)
embedding_dim = 64
rnn_units = 128
# NOTE(review): presumably the vocabulary size of the English tokenizer;
# confirm it is also an upper bound for the German token ids fed to the Embedding.
vocab_size = 30522

# Build and compile the model within the device scope
with tf.device(device_name):
    model = tf.keras.models.Sequential([
        tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
        # The targets are full sequences of shape (batch, max_length), so the RNN
        # must emit one output per time step; with return_sequences=False the model
        # output is (batch, vocab_size) and does not match the labels.
        tf.keras.layers.SimpleRNN(rnn_units, return_sequences=True),
        # Dense acts on the last axis, giving (batch, max_length, vocab_size).
        tf.keras.layers.Dense(vocab_size, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train and evaluate within the device scope
# NOTE(review): the per-step softmax output is batch_size * max_length * vocab_size
# floats; lower batch_size or max_length if this does not fit in memory.
with tf.device(device_name):
    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        batch_size=64,  # Adjust as needed
        epochs=5       # Adjust as needed
    )

