In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import mean_absolute_error
import numpy as np

# Step 1: Load the dataset
# NOTE(review): the path contains a space before ".csv" - confirm it matches the uploaded file name.
data = pd.read_csv("/content/modified_ielts_writing_dataset .csv")


# Step 2: Prepare the Data
# Drop incomplete rows first: a missing Question/Essay turns the concatenated
# text into NaN (not a string the tokenizer can handle) and a missing Overall
# value would make the regression loss NaN.
data = data.dropna(subset=['Question', 'Essay', 'Overall'])
X = data['Question'] + " " + data['Essay']  # Model input: question, one space, essay
y = data['Overall']  # Regression target

# Step 3: Tokenization and Padding
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)
# maxlen=500 is a fixed cap, not the measured longest sequence: with the Keras
# defaults, shorter inputs are zero-padded at the front and longer inputs lose
# their leading tokens.
X_pad = pad_sequences(X_seq, maxlen=500)

# Step 4: Split the Data
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.2, random_state=42)

# Step 5: Define the LSTM Model
# Seed Python, NumPy and TensorFlow RNGs before any weights are created so that
# a Restart & Run All reproduces the same initialisation and batch order.
from tensorflow.keras.utils import set_random_seed
set_random_seed(42)

model = Sequential()
# +1 because Tokenizer word indices start at 1; index 0 is the padding value.
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=50, input_length=500))
model.add(LSTM(64))
model.add(Dense(1, activation='linear'))  # Linear activation function for regression task




In [None]:
# Compile the model
model.compile(loss='mean_squared_error', optimizer='adam')

# Step 6: Train the Model
# restore_best_weights=True rolls the model back to the epoch with the lowest
# val_loss when early stopping triggers; without it the model keeps the weights
# from the last epoch run, i.e. `patience` epochs after the best one.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
# Keep the History object so the loss curves can be inspected later (this also
# stops the bare History repr from being echoed as the cell output).
history = model.fit(X_train, y_train, epochs=20, batch_size=64, validation_split=0.1, callbacks=[early_stopping])



Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20


<keras.callbacks.History at 0x7d662c913be0>

In [None]:
# Step 7: Evaluate the Model
# Predict on the held-out split, then report the mean absolute error against
# the true targets.
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print("Mean Absolute Error:", mae)



Mean Absolute Error: 0.8107861791338239


## Testing question and answer in the dataset

In [None]:
new_question = "The bar chart below describes some changes about the percentage of people were born in Australia and who were born outside Australia living in urban, rural and town between 1995 and 2010.Summarise the information by selecting and reporting the main features and make comparisons where relevant."
user_answer = "Between 1995 and 2010, a study was conducted representing the percentages of people born in Australia, versus people born outside Australia, living in urban, rural, and town. First, in 1995, cities represented the major percentage of habitat by roughly 50 percent, followed by rural areas and towns came in last, among people born in Australia. On the other hand, people born outside Australia, cities showed the most percentages of 60 percent, followed by rural areas and towns. In 2010, among people born in Australia, cities had an increase more than 20 percent increase in the total representation and a major decrease in towns and rural areas. Conversely, people born outside Australia, cities had the most percentage among both studies, followed by rural areas and towns."

# Model input follows the training layout: question, one space, then the essay.
new_input_text = " ".join((new_question, user_answer))

# Encode with the fitted tokenizer, then pad/truncate to the training length (500).
new_input_seq = tokenizer.texts_to_sequences([new_input_text])
new_input_pad = pad_sequences(new_input_seq, maxlen=500)

# One padded row goes in; the single regression output is printed.
predicted_score = model.predict(new_input_pad)
print("Predicted Score:", predicted_score[0][0])

Predicted Score: 5.6314654


## Testing question and answer in the dataset

In [None]:
new_question = "The bar chart below describes some changes about the percentage of people were born in Australia and who were born outside Australia living in urban, rural and town between 1995 and 2010.Summarise the information by selecting and reporting the main features and make comparisons where relevant."
user_answer = "The left chart shows the population change happened in Austrilia from 1995 to 2010. In 1995, half of the people born in australia are from cities, 30% from rural areas and only 20% are from towns. For the people outside of Australia, most of the people still born in cities, which is around 60%. but the number of rural areas increased to 40% with the towns born rate decreased to only 10%.In 2010, The people born in cities increased significianly in both in and outside Australia, especially in outside Australia, which reached 80%. The people bore in towns decreased simutanuously, to around 17% of the people born in Australia and 10% of outside Australia respectively. The most significiant change happened at rual areas numbers. It has shrinked to 17% of people born in Australia, and only around 5% of peopel bore outside Australia.Overall, the chart shows us the trend that many people moved to Cities from rual area in the past 15 years."

# Model input follows the training layout: question, one space, then the essay.
new_input_text = " ".join((new_question, user_answer))

# Encode with the fitted tokenizer, then pad/truncate to the training length (500).
new_input_seq = tokenizer.texts_to_sequences([new_input_text])
new_input_pad = pad_sequences(new_input_seq, maxlen=500)

# One padded row goes in; the single regression output is printed.
predicted_score = model.predict(new_input_pad)
print("Predicted Score:", predicted_score[0][0])

Predicted Score: 6.5379605


## Testing question and answer in the dataset

In [None]:
new_question = "Some countries achieve international sports by building specialised facilities to train top athletes, instead of providing sports facilities that everyone can use. Do you think this is positive or negative development?Discuss both views and give your opinion."
user_answer = "Athletes to be on the top need to have high-end facilities that a country can provide for them to be able to improve their strengths and skills. Some government opted to build facilities exclusively for training athletes for international sports goals, instead of building establishments for everyone to have the liberty of using the sports facilities. From my perspective, this phenomena is a negative development.Being able to discover an individual's potential, they need to have a support from any form of sources. Establishing sports facilities can help people to discover their own interests, talents and even skills in learning new things. Centers which offer utilities, equipments and state-of-the-art tools that provide learning sports can be a great advantage to people who want to train themselves on the things that can awaken their interests when it comes to sports.In addition, building these kind of establishments, not only for the purpose to train top athletes, but also for people to immerse themselves to different variaties of activities can help them socialize more with their peers. This is extremely beneficial that aside from doing sports that they are into, they get to play along with peers, discover one's potential and also learning from one another. Furthermore, these individuals can also have the future to be one of top athletes that a country can have.In conclusion, although there are countries that chose to have their sports establishments exclusively for training top athletes, I assert that these that actions can be lead to negative feedbacks. Government should allow everyone the privilege to use and avail facilities that can hone an individual's talents, skills and dexterity when it comes to sports and this can also be immensely benefical for both country and citizen alike."

# Model input follows the training layout: question, one space, then the essay.
new_input_text = " ".join((new_question, user_answer))

# Encode with the fitted tokenizer, then pad/truncate to the training length (500).
new_input_seq = tokenizer.texts_to_sequences([new_input_text])
new_input_pad = pad_sequences(new_input_seq, maxlen=500)

# One padded row goes in; the single regression output is printed.
predicted_score = model.predict(new_input_pad)
print("Predicted Score:", predicted_score[0][0])

Predicted Score: 5.521077


## Testing question in the dataset and user's answer

In [None]:
new_question = "Some countries achieve international sports by building specialised facilities to train top athletes, instead of providing sports facilities that everyone can use. Do you think this is positive or negative development?Discuss both views and give your opinion."
user_answer = "Constructing specialized sports facilities to train elite athletes raises concerns regarding equity and access for the broader population. While such investments may enhance a country's international sporting success, prioritizing elite training facilities over public access to sports facilities may exacerbate inequalities and limit opportunities for participation in sports among the general populace."

# Model input follows the training layout: question, one space, then the essay.
new_input_text = " ".join((new_question, user_answer))

# Encode with the fitted tokenizer, then pad/truncate to the training length (500).
new_input_seq = tokenizer.texts_to_sequences([new_input_text])
new_input_pad = pad_sequences(new_input_seq, maxlen=500)

# One padded row goes in; the single regression output is printed.
predicted_score = model.predict(new_input_pad)
print("Predicted Score:", predicted_score[0][0])

Predicted Score: 6.5802865


## Testing question in the dataset and user's answer

In [None]:
new_question = "The bar chart below describes some changes about the percentage of people were born in Australia and who were born outside Australia living in urban, rural and town between 1995 and 2010.Summarise the information by selecting and reporting the main features and make comparisons where relevant."
user_answer = "The bar chart illustrates shifts in the residential distribution of individuals born in Australia and abroad across urban, rural, and town areas from 1995 to 2010. Key observations reveal urban habitation predominance, notably increasing for Australian-born citizens. Conversely, rural and town residence declines are evident over the studied period."

# Model input follows the training layout: question, one space, then the essay.
new_input_text = " ".join((new_question, user_answer))

# Encode with the fitted tokenizer, then pad/truncate to the training length (500).
new_input_seq = tokenizer.texts_to_sequences([new_input_text])
new_input_pad = pad_sequences(new_input_seq, maxlen=500)

# One padded row goes in; the single regression output is printed.
predicted_score = model.predict(new_input_pad)
print("Predicted Score:", predicted_score[0][0])





Predicted Score: 6.914455


## Testing question in the dataset and user's answer

## Testing question and answer in the dataset

In [None]:
new_question = "Nowadays, not enough students choose science subjects in university in many countries. What are the reasons for this problem? What are the effects on society?"
# The essay is kept on a single line: a raw line break inside a double-quoted
# literal is a syntax error, and the tokenizer treats a newline like a space
# anyway, so the token sequence is the same.
user_answer = "In recent years, many countries have reported a decrease in the number of students choosing science subjects in universities. This is a concerning trend that can have a profound impact on society. In this essay, I will discuss the reasons behind this problem and its implications.The first reason for fewer students selecting science subjects is likely that these courses are difficult to comprehend. In general, science courses such as mathematics and physics involve complex theories, formulas, and concepts that require a great deal of hard work to understand. Furthermore, they are often taught in a theoretical manner and fail to show students how this knowledge can be used in real life. In other words, it may seem impractical for them to learn science courses. Lastly, it may be hard to acquire a practical job through these theoretical subjects in the future. As a result, students may give up studying science because they find scientific studies unappealing, impractical, and not promising for their future careers.Consequently, the lack of students studying science subjects can have a significant impact on society. First and foremost, there may not be enough qualified scientists and teachers to teach the theoretical foundation of technology, medicine, and engineering. This could lead to slow progress in these areas, and society may suffer as a result. Moreover, the decline in the number of science students could lead to a shortage of scientists for research and development in industry, which would have a negative impact on the nation's economy.To sum up, the decreasing number of students selecting science courses in universities is a worrying phenomenon. The main reasons for this are the difficulty of science courses and the lack of job opportunities in science after studies. This could have a negative impact on society, as there may not be enough qualified scientists to pursue research, and this could impede progress in fields such as technology and medicine."

# Build the model input from THIS cell's question and answer. Without this line
# the cell silently scores whatever `new_input_text` an earlier cell left in the
# kernel, not the essay defined above.
new_input_text = new_question + " " + user_answer

# Tokenize and pad the new input text
new_input_seq = tokenizer.texts_to_sequences([new_input_text])
new_input_pad = pad_sequences(new_input_seq, maxlen=500)  # Use the same maxlen as during training

# Predict the score for the new input text
predicted_score = model.predict(new_input_pad)
print("Predicted Score:", predicted_score[0][0])

Predicted Score: 7.1203666


## Testing a question from the dataset with an answer that belongs to a different question in the dataset

In [None]:
new_question = "Nowadays, not enough students choose science subjects in university in many countries. What are the reasons for this problem? What are the effects on society?"
user_answer = "The two diagrams compare the proportion of time spent by adult workers in a specific country on different activities over a 50-year period, from 1958 to 2008.Overall, it can clearly be seen that in 1958, adult workers divided their time roughly equally between work, sleep, and other activities. However, over the next five decades, there was a remarkable increase in working hours, a significant decrease in sleep time, and the total time allocated to other activities remained nearly unchanged.It is surprising that adult workers spent almost the same amount of time sleeping (32%) as they did working (33%) in 1958. However, by 2008, they slept 7% less (25%) and worked 9% more (42%), which represents a remarkable increase in working hours. As a result, the time they spent at home for relaxation increased from 8% to 13%. In contrast, the time spent going out with friends and family declined dramatically by one-third, from 19% to 6%. The most striking change, however, was the significant increase in time spent traveling to work, which quadrupled from 2% to 8% over the 50 years."

# Model input follows the training layout: question, one space, then the essay.
new_input_text = " ".join((new_question, user_answer))

# Encode with the fitted tokenizer, then pad/truncate to the training length (500).
new_input_seq = tokenizer.texts_to_sequences([new_input_text])
new_input_pad = pad_sequences(new_input_seq, maxlen=500)

# One padded row goes in; the single regression output is printed.
predicted_score = model.predict(new_input_pad)
print("Predicted Score:", predicted_score[0][0])

Predicted Score: 6.739277


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import mean_absolute_error
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK resources
# 'punkt_tab' is the tokenizer data that word_tokenize loads on recent NLTK
# releases (older releases load 'punkt'), so both are requested. An id the
# installed NLTK does not know is reported by nltk.download and skipped; it
# does not raise.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Function to preprocess text
def preprocess_text(text):
    """Normalise ``text`` into a space-separated string of lemmas.

    The text is lower-cased and split with NLTK's ``word_tokenize``. Tokens
    that are not purely alphanumeric, or that appear in NLTK's English
    stop-word list, are discarded. Each remaining token is passed through
    ``WordNetLemmatizer.lemmatize`` and the results are joined with single
    spaces.
    """
    raw_tokens = word_tokenize(text.lower())
    english_stops = set(stopwords.words('english'))

    # First pass: filter punctuation/mixed-symbol tokens and stop words.
    kept = []
    for token in raw_tokens:
        if token.isalnum() and token not in english_stops:
            kept.append(token)

    # Second pass: lemmatize what survived the filter.
    wnl = WordNetLemmatizer()
    lemmas = map(wnl.lemmatize, kept)
    return " ".join(lemmas)

# Step 1: Load the dataset
# NOTE(review): the path contains a space before ".csv" - confirm it matches the uploaded file name.
data = pd.read_csv("/content/modified_ielts_writing_dataset .csv")

# Step 2: Prepare the Data
# Remove rows with missing values. Reassigning (rather than inplace=True) keeps
# the cell idempotent and yields the same frame.
data = data.dropna(subset=['Question', 'Essay', 'Overall'])
X = data['Question'] + " " + data['Essay']  # Model input: question, one space, essay
y = data['Overall']  # Regression target

# Step 3: Tokenization and Padding
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)
# Fixed length of 500 tokens: with the Keras defaults, shorter inputs are
# zero-padded at the front and longer inputs lose their leading tokens.
X_pad = pad_sequences(X_seq, maxlen=500)

# Step 4: Split the Data
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.2, random_state=42)

# Step 5: Define the LSTM Model
# Seed Python, NumPy and TensorFlow RNGs before any weights are created so
# repeated runs of this cell give the same initialisation and batch order.
from tensorflow.keras.utils import set_random_seed
set_random_seed(42)

model = Sequential()
# +1 because Tokenizer word indices start at 1; index 0 is the padding value.
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=50, input_length=500))
model.add(LSTM(64))
model.add(Dense(1, activation='linear'))

# Compile the model
model.compile(loss='mean_squared_error', optimizer='adam')

# Step 6: Train the Model
# restore_best_weights=True rolls the model back to the epoch with the lowest
# val_loss when early stopping triggers; without it the evaluated model carries
# the weights from the last epoch run, `patience` epochs after the best one.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
model.fit(X_train, y_train, epochs=20, batch_size=64, validation_split=0.1, callbacks=[early_stopping])

# Step 7: Evaluate the Model
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae)

# Example user's answer
user_answer = "Between 1995 and 2010, a study was conducted representing the percentages of people born in Australia, versus people born outside Australia, living in urban, rural, and town. First, in 1995, cities represented the major percentage of habitat by roughly 50 percent, followed by rural areas and towns came in last, among people born in Australia. On the other hand, people born outside Australia, cities showed the most percentages of 60 percent, followed by rural areas and towns. In 2010, among people born in Australia, cities had an increase more than 20 percent increase in the total representation and a major decrease in towns and rural areas. Conversely, people born outside Australia, cities had the most percentage among both studies, followed by rural areas and towns."

# Preprocess user's answer (lower-case, drop stop words/punctuation, lemmatize)
preprocessed_user_answer = preprocess_text(user_answer)

# Calculate vocabulary rate: distinct tokens / total tokens after preprocessing.
# An answer made up only of stop words or punctuation leaves no tokens, so the
# rate falls back to 0.0 instead of dividing by zero.
user_tokens = word_tokenize(preprocessed_user_answer)
vocab_rate = len(set(user_tokens)) / len(user_tokens) if user_tokens else 0.0

# Heuristic grammar rate (placeholder metric, not a real grammar checker)
def calculate_grammar_rate(answer):
    """Return a crude "grammar rate" for ``answer``.

    Every semicolon and every comma is counted as one error, and the error
    count is normalised by the character length of the answer. This is a
    simplistic placeholder; a proper NLP grammar checker is needed for a
    meaningful score.

    Args:
        answer: The raw answer text.

    Returns:
        ``1 - errors / len(answer)``. An empty (or ``None``) answer returns
        ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    if not answer:
        # Nothing to rate, and len(answer) would be a zero divisor.
        return 0.0
    num_errors = answer.count(";") + answer.count(",")
    return 1 - (num_errors / len(answer))  # Return grammar rate (normalized)


def calculate_vocabulary_rate(answer):
    """Return the ratio of distinct words to total words in ``answer``.

    Words are the whitespace-separated pieces of the raw text, so the
    comparison is case-sensitive and punctuation stays attached to its word.

    Args:
        answer: The raw answer text.

    Returns:
        ``len(unique words) / len(words)``. An answer with no words (empty,
        whitespace-only or ``None``) returns ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    words = answer.split() if answer else []
    if not words:
        # No words at all: avoid dividing by zero.
        return 0.0
    unique_words = set(words)
    return len(unique_words) / len(words)  # Return vocabulary rate

# Surface metrics computed on the raw (un-preprocessed) answer text
grammar_rate = calculate_grammar_rate(user_answer)
vocabulary_rate = calculate_vocabulary_rate(user_answer)

print("Grammar Rate:", grammar_rate)
print("Vocabulary Rate:", vocabulary_rate)

# Build the model input in the training layout: question, one space, then the essay
new_question = "The bar chart below describes some changes about the percentage of people were born in Australia and who were born outside Australia living in urban, rural and town between 1995 and 2010. Summarise the information by selecting and reporting the main features and make comparisons where relevant."
new_input_text = " ".join((new_question, user_answer))

# Encode with the fitted tokenizer, then pad/truncate to the training length (500)
new_input_seq = tokenizer.texts_to_sequences([new_input_text])
new_input_pad = pad_sequences(new_input_seq, maxlen=500)

# One padded row goes in; the single regression output is printed
predicted_score = model.predict(new_input_pad)
print("Predicted Score:", predicted_score[0][0])


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Mean Absolute Error: 1.590172476665948
Grammar Rate: 0.9781209781209781
Vocabulary Rate: 0.504
Predicted Score: 5.5967793


In [None]:
# Essay prompt and the user's response to it.
# `new_question` must hold the prompt and `user_answer` the response: the model
# was trained on "question + ' ' + essay", and the metrics below are meant to
# describe the response, not the prompt.
new_question = "Rich countries often give money to poorer countries, but it does not solve poverty. Therefore, developed countries should give other types of help to the poor countries rather than financial aid. To what extent do you agree or disagree?"
user_answer = "While financial aid from wealthy nations to impoverished ones may seem ineffective in eradicating poverty, it remains a crucial lifeline for struggling economies. Alternative forms of assistance, such as technology transfer or educational programs, could supplement monetary aid. However, dismissing financial support entirely overlooks its immediate impact on basic needs like healthcare and infrastructure development. Therefore, a balanced approach integrating various forms of aid would better address the multifaceted challenges of poverty."

# Preprocess user's answer
preprocessed_user_answer = preprocess_text(user_answer)

# Calculate vocabulary rate: distinct tokens / total tokens after preprocessing.
# Falls back to 0.0 when preprocessing leaves no tokens (avoids a zero division).
user_tokens = word_tokenize(preprocessed_user_answer)
vocab_rate = len(set(user_tokens)) / len(user_tokens) if user_tokens else 0.0

# Grammar rate from the notebook's punctuation heuristic. A random draw would
# change on every run and carry no information about the answer.
predicted_grammar_rate = calculate_grammar_rate(user_answer)

print("Vocabulary Rate:", vocab_rate)
print("Predicted Grammar Rate:", predicted_grammar_rate)

# Build the model input in the training layout: question, one space, then the essay
new_input_text = new_question + " " + user_answer
new_input_seq = tokenizer.texts_to_sequences([new_input_text])
new_input_pad = pad_sequences(new_input_seq, maxlen=500)  # Use the same maxlen as during training

# Predict the score for the new input text
predicted_score = model.predict(new_input_pad)
print("Predicted Score:", predicted_score[0][0])


Grammar Rate: 0.9915254237288136
Vocabulary Rate: 0.8974358974358975
Predicted Score: 6.858673
