In [1]:
#importing required libraries
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
#reading the training dataset
# Each non-blank line is expected to hold exactly three whitespace-separated
# columns: the word, its POS tag, and a third column that is ignored here.
# A blank line marks the end of the current sentence.
sentence_train = []
sentence = []

# iterating through the dataset and storing it in the sentence_train variable
# `with` guarantees the file handle is closed even if a malformed line raises.
with open("train.txt", "r") as f:
    for line in f:
        line = line.strip()
        if line == "":
            # Only flush non-empty sentences, so leading or consecutive blank
            # lines do not inject empty sentences into the dataset.
            if sentence:
                sentence_train.append(sentence)
                sentence = []
        else:
            # split() with no argument tolerates tabs and repeated spaces;
            # a line without exactly three columns still raises ValueError.
            # A named throwaway is used instead of `_`, which IPython reserves
            # for the last cell output.
            word, pos_tag, _unused = line.split()
            sentence.append((word, pos_tag))

# Add the last sentence in the dataset (the file may not end with a blank line)
if sentence:
    sentence_train.append(sentence)

In [4]:
# Split the sentences (not individual tokens) into an 80% training part and a
# 20% held-out part; the fixed random_state keeps the split reproducible.
train_set, test_set = train_test_split(
    sentence_train,
    train_size=0.80,
    test_size=0.20,
    random_state=101,
)


In [5]:
# Per-token feature extraction used by the POS tagger
def pos_features(sentence, i):
    """Build the feature dict for the token at position ``i`` of ``sentence``.

    ``sentence`` is a sequence of word strings. The returned dict holds the
    word itself, whether it is the first / last token of the sequence, two
    casing flags, and the neighbouring words ('' when there is no neighbour).

    Note: the casing flags compare the word with its upper-/lower-cased form,
    so tokens without letters (digits, punctuation) set both flags to True.
    """
    token = sentence[i]
    at_start = i == 0
    at_end = i == len(sentence) - 1
    return {
        'word': token,
        'is_first': at_start,
        'is_last': at_end,
        'is_all_caps': token == token.upper(),
        'is_all_lower': token == token.lower(),
        'prev_word': sentence[i-1] if not at_start else '',
        'next_word': sentence[i+1] if not at_end else '',
    }

In [6]:
# Turn tagged sentences into a flat list of per-token feature dicts
def extract_features(sentences):
    """Return one feature dict per token, flattened across all sentences.

    ``sentences`` is an iterable of sentences, each an iterable of
    (word, tag) pairs. The tags are dropped; only the words of a sentence are
    handed to ``pos_features``, so neighbour features never cross a sentence
    boundary. Output order follows the input: sentence by sentence, token by
    token.
    """
    collected = []
    for tagged_sentence in sentences:
        tokens = [token for token, _tag in tagged_sentence]
        collected.extend(pos_features(tokens, idx) for idx in range(len(tokens)))
    return collected

# Per-token feature dicts for the training split and for the held-out split.
# NOTE: the test_features name is rebound later in this notebook by the cell
# that reads test_data.txt.
train_features = extract_features(train_set)
test_features = extract_features(test_set)

In [7]:
# vectorizing the features for the training set
# The vectorizer is fitted on the training feature dicts only; the held-out
# split is transformed with that already-fitted vectorizer.
vectorizer = DictVectorizer()
X_train = vectorizer.fit_transform(train_features)
X_test = vectorizer.transform(test_features)

In [8]:
# Generating labels for dataset
# The gold tags are flattened in the same sentence-by-sentence, token-by-token
# order that extract_features uses, so labels line up with the feature rows.
y_train = [tag for tagged_sentence in train_set for _word, tag in tagged_sentence]

y_test = [tag for tagged_sentence in test_set for _word, tag in tagged_sentence]

In [9]:
# fitting the logistic regression model on training set
# NOTE(review): max_iter=500 is presumably chosen so the solver converges on
# this feature set — confirm.
clf = LogisticRegression(max_iter=500)
clf.fit(X_train, y_train)

In [10]:
# predicting the labels for the held-out split
# When the notebook is run top to bottom, X_test at this point is the
# vectorized 20% split, so y_pred lines up one-to-one with y_test.
y_pred = clf.predict(X_test)

In [11]:
# Token-level accuracy of the predictions on the held-out split
val_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', val_accuracy)

Accuracy: 0.9470305750621395


In [16]:
# reading the test dataset
# The file is read once and its blank-line sentence boundaries are kept, so
# that features are computed per sentence, the same way extract_features
# builds them for training. Flattening the whole file into one token list
# would make is_first / is_last / prev_word / next_word ignore sentence
# boundaries and disagree with the features the model was trained on.
test_sentences = []
current_words = []
with open("test_data.txt", "r") as f:
    for line in f:
        line = line.strip()
        if line == "":
            # blank line: close the sentence collected so far (if any)
            if current_words:
                test_sentences.append(current_words)
                current_words = []
        else:
            # only the first column (the token) is used; any further columns
            # on the line are ignored
            current_words.append(line.split()[0])

# the last sentence may not be followed by a blank line
if current_words:
    test_sentences.append(current_words)

# flat list of all tokens in file order
test_words = [word for words in test_sentences for word in words]

# extracting features from the test words, one sentence at a time
# NOTE: test_features, X_test and y_pred are rebound below; after this cell
# they no longer refer to the held-out validation split.
test_features = [
    pos_features(words, i)
    for words in test_sentences
    for i in range(len(words))
]

# vectorizing the features for the test data
X_test = vectorizer.transform(test_features)

# predicting the POS tags for test data
y_pred = clf.predict(X_test).tolist()

# writing one "<word> <predicted tag>" line per token, followed by a blank
# line after every sentence
output_parts = []
offset = 0
for words in test_sentences:
    # Slicing by offset replaces y_pred.pop(0): it is linear in the number of
    # tokens and leaves y_pred intact for later inspection.
    sentence_tags = y_pred[offset:offset + len(words)]
    offset += len(words)
    for word, tag in zip(words, sentence_tags):
        output_parts.append(word + ' ' + tag + '\n')
    output_parts.append('\n')
output = ''.join(output_parts)

with open('test_predicted.txt', 'w') as f:
    f.write(output)
