In [1]:
import requests
from bs4 import BeautifulSoup

url = "https://www.learnentry.com/english-to-swahili/swahili-sentences-and-phrases/"

response = requests.get(url)

soup = BeautifulSoup(response.content, 'html.parser')

english_sentences = []
swahili_sentences = []

for row in soup.find_all('tr')[1:]:
    cols = row.find_all('td')
    if len(cols) == 2:  # Add this condition to check if there are two columns in the row
        english_sentence = cols[0].get_text().strip()
        swahili_sentence = cols[1].get_text().strip()
        english_sentences.append(english_sentence)
        swahili_sentences.append(swahili_sentence)
    
print("English Sentences:\n", english_sentences)
print("\nSwahili Sentences:\n", swahili_sentences)


English Sentences:
 ['Thanks', 'Good', 'Enjoy', 'Fine', 'Congratulations', 'I hate you', 'I love you', 'I’m in love', 'I’m sorry', 'I’m so sorry', 'I’m yours', 'Thanks again', 'How are you', 'I am fine', 'Take care', 'I miss you', 'You’re nice', 'That’s terrible', 'That’s too bad', 'That’s too much', 'See you', 'Thank you', 'Thank you sir', 'Are you free', 'No problem', 'Get well soon', 'Very good', 'Well done', 'What’s up', 'I can’t hear you', 'I can’t stop', 'I know', 'Good bye', 'Good idea', 'Good luck', 'You are late', 'Who is next?', 'Who is she?', 'Who is that man?', 'Who built it?', 'They hurt', 'She got angry', 'She is a teacher', 'She is aggressive', 'She is attractive', 'She is beautiful', 'She is crying', 'She is happy', 'No way!', 'No worries', 'No, thank you', 'I’m so happy', 'I’m hungry', 'I’m able to run', 'I agree', 'I can swim', 'I can’t come', 'He got angry', 'He was alone', 'He was brave', 'He likes to swim', 'Don’t be angry', 'Don’t be sad', 'Don’t cry', 'Come in', 

In [2]:
#code after preprocessing which involves converting to lower case
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://www.learnentry.com/english-to-swahili/swahili-sentences-and-phrases/"

response = requests.get(url)

soup = BeautifulSoup(response.content, 'html.parser')

english_sentences = []
swahili_sentences = []

for row in soup.find_all('tr')[1:]:
    cols = row.find_all('td')
    if len(cols) == 2:
        english_sentence = cols[0].get_text().strip()
        swahili_sentence = cols[1].get_text().strip()
        english_sentences.append(english_sentence)
        swahili_sentences.append(swahili_sentence)
    
data = {'English': english_sentences, 'Swahili': swahili_sentences}
df = pd.DataFrame(data)

# Preprocessing the data
df['English'] = df['English'].apply(lambda x: x.lower())
df['Swahili'] = df['Swahili'].apply(lambda x: x.lower())

# Save to CSV file
df.to_csv('sentences.csv', index=False)


In [3]:
print (df)

             English   Swahili
0             thanks    asante
1               good     nzuri
2              enjoy  furahiya
3               fine     faini
4    congratulations   hongera
..               ...       ...
403            panic  wasiwasi
404            thank    asante
405           desire      hamu
406            woman  mwanamke
407           hungry      njaa

[408 rows x 2 columns]


In [4]:
#In this updated code, I have added the TfidfVectorizer class from scikit-learn library to extract features using the TF-IDF technique
#updated code with feature extraction
import requests
from bs4 import BeautifulSoup
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

url = "https://www.learnentry.com/english-to-swahili/swahili-sentences-and-phrases/"

response = requests.get(url)

soup = BeautifulSoup(response.content, 'html.parser')

english_sentences = []
swahili_sentences = []

for row in soup.find_all('tr')[1:]:
    cols = row.find_all('td')
    if len(cols) == 2:
        english_sentence = cols[0].get_text().strip()
        swahili_sentence = cols[1].get_text().strip()
        english_sentences.append(english_sentence)
        swahili_sentences.append(swahili_sentence)
    
data = {'English': english_sentences, 'Swahili': swahili_sentences}
df = pd.DataFrame(data)

# Preprocessing the data
df['English'] = df['English'].apply(lambda x: x.lower())
df['Swahili'] = df['Swahili'].apply(lambda x: x.lower())

# Feature extraction using TF-IDF
tfidf = TfidfVectorizer()
X_english = tfidf.fit_transform(df['English'])
X_swahili = tfidf.fit_transform(df['Swahili'])

print("English Features:\n", X_english)
print("\nSwahili Features:\n", X_swahili)


English Features:
   (0, 331)	1.0
  (1, 140)	1.0
  (2, 103)	1.0
  (3, 121)	1.0
  (4, 65)	1.0
  (5, 385)	0.31845540198385114
  (5, 152)	0.9479378444535822
  (6, 195)	0.9084838108765929
  (6, 385)	0.41792004662990595
  (7, 168)	0.6939574514539055
  (7, 195)	0.7200160106355972
  (8, 301)	1.0
  (9, 295)	0.6571121287869782
  (9, 301)	0.7537928430285379
  (10, 387)	1.0
  (11, 7)	0.7822185632371257
  (11, 331)	0.6230041085957995
  (12, 21)	0.55344351637931
  (12, 162)	0.7566640936834965
  (12, 385)	0.34808005330360026
  (13, 14)	0.6618173825605836
  (13, 121)	0.7496650933187821
  (14, 53)	0.7071067811865475
  (14, 317)	0.7071067811865475
  (15, 208)	0.9356120976397049
  :	:
  (389, 189)	0.533576224263735
  (389, 60)	0.463120675918879
  (389, 341)	0.4160516395147877
  (389, 385)	0.2352288682684165
  (390, 302)	0.5267466893624159
  (390, 93)	0.48584680124440194
  (390, 352)	0.4053001036775977
  (390, 362)	0.5676465774804298
  (391, 101)	1.0
  (392, 11)	1.0
  (393, 227)	1.0
  (394, 294)	1.0
  (3

In [10]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Load the data
url = "https://www.learnentry.com/english-to-swahili/swahili-sentences-and-phrases/"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
english_sentences = []
swahili_sentences = []
for row in soup.find_all('tr')[1:]:
    cols = row.find_all('td')
    if len(cols) == 2:
        english_sentence = cols[0].get_text().strip()
        swahili_sentence = cols[1].get_text().strip()
        english_sentences.append(english_sentence)
        swahili_sentences.append(swahili_sentence)
data = {'English': english_sentences, 'Swahili': swahili_sentences}
df = pd.DataFrame(data)

# Preprocessing the data
df['English'] = df['English'].apply(lambda x: x.lower())
df['Swahili'] = df['Swahili'].apply(lambda x: x.lower())

# Split the data into train and test sets
train_sentences, test_sentences, train_targets, test_targets = train_test_split(df['English'], df['Swahili'], test_size=0.2, random_state=42)

# Tokenize the input and output sequences
input_tokenizer = Tokenizer()
input_tokenizer.fit_on_texts(train_sentences)
train_sequences = input_tokenizer.texts_to_sequences(train_sentences)
test_sequences = input_tokenizer.texts_to_sequences(test_sentences)

output_tokenizer = Tokenizer()
output_tokenizer.fit_on_texts(train_targets)
train_targets_sequences = output_tokenizer.texts_to_sequences(train_targets)
test_targets_sequences = output_tokenizer.texts_to_sequences(test_targets)

# Get the maximum sequence length for padding
max_seq_len_input = max(len(sequence) for sequence in train_sequences)
max_seq_len_output = max(len(sequence) for sequence in train_targets_sequences)
max_seq_len = max(max_seq_len_input, max_seq_len_output)

# Pad the input and output sequences
train_sequences_padded = pad_sequences(train_sequences, maxlen=max_seq_len, padding='post')
test_sequences_padded = pad_sequences(test_sequences, maxlen=max_seq_len, padding='post')

train_targets_sequences_padded = pad_sequences(train_targets_sequences, maxlen=max_seq_len, padding='post')
test_targets_sequences_padded = pad_sequences(test_targets_sequences, maxlen=max_seq_len, padding='post')

# Convert the output sequences to one-hot encoding
train_targets_onehot = to_categorical(train_targets_sequences_padded, num_classes=len(output_tokenizer.word_index)+1)
test_targets_onehot = to_categorical(test_targets_sequences_padded, num_classes=len(output_tokenizer.word_index)+1)

#Define the model architecture
input_shape = (max_seq_len,)
output_shape = (max_seq_len, len(output_tokenizer.word_index)+1)
input_layer = Input(shape=input_shape)
embedding_layer = Embedding(input_dim=len(input_tokenizer.word_index)+1, output_dim=256)(input_layer)
lstm_layer = LSTM(256, return_sequences=True)(embedding_layer)
output_layer = Dense(len(output_tokenizer.word_index)+1, activation='softmax')(lstm_layer)

model = Model(inputs=input_layer, outputs=output_layer)

#Compile the model
optimizer = Adam(lr=0.001)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

#Train the model
early_stopping = EarlyStopping(monitor='val_loss', patience=5)
history = model.fit(train_sequences_padded, train_targets_onehot, validation_data=(test_sequences_padded, test_targets_onehot), batch_size=128, epochs=100, callbacks=[early_stopping])

#Evaluate the model on the test set
loss, accuracy = model.evaluate(test_sequences_padded, test_targets_onehot)
print("Test set accuracy: {:.2f}%".format(accuracy * 100))



Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Test set accuracy: 81.71%
