-
Notifications
You must be signed in to change notification settings - Fork 0
/
htmlparser.c
170 lines (158 loc) · 3.67 KB
/
htmlparser.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
#pragma once
#pragma execution_character_set("UTF-8")
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "htmlparser.h"
#include "stack.h"
//Read the remaining text of fp into buf as one NUL-terminated string.
//
//fp      - stream to read from; nothing happens when it is NULL
//buf     - destination buffer; nothing happens when it is NULL
//max_len - size of buf in bytes, terminator included; values <= 0 are ignored
//
//At most max_len-1 characters are stored. Whenever max_len > 0 the buffer is
//terminated, even if not a single character could be read (empty file,
//immediate EOF, or max_len == 1).
void htmlparser_read_file(FILE* fp, char* buf, long int max_len)
{
    //fgets takes an int count; 32767 is the smallest value INT_MAX may have,
    //so a chunk of this size is representable on every implementation
    enum { CHUNK_MAX = 32767 };
    long used = 0;

    if (fp == NULL || buf == NULL || max_len <= 0)
        return;

    //guarantee a valid (empty) string before the first read attempt
    buf[0] = '\0';

    while (used + 1 < max_len) {
        long room = max_len - used;
        int chunk = (room > CHUNK_MAX) ? CHUNK_MAX : (int)room;

        if (fgets(buf + used, chunk, fp) == NULL)
            break;
        //advance to the terminator fgets just wrote so the next chunk is appended
        used += (long)strlen(buf + used);
    }
}
//Remove every "<...>" span from buf, in place.
//
//Each '>' is paired with the nearest preceding '<' that is still present;
//the two brackets and everything between them are deleted, so nested spans
//such as "a<b<c>d>e" collapse to "ae". A '>' with no earlier '<' and a '<'
//that is never closed are left in the text untouched.
//
//buf - NUL-terminated string to edit; NULL is ignored
//
//The text is compacted with a read cursor and a write cursor in one pass, so
//no auxiliary storage is needed: every '<' that is still in the output is by
//construction an unmatched one, which makes the output itself the record of
//open brackets.
void htmlparser_remove_angle_brackets(char* buf)
{
    char *src, *dst;
    size_t pending = 0;//number of unmatched '<' currently kept in the output

    if (buf == NULL)
        return;

    for (src = dst = buf; *src != '\0'; ++src) {
        if ('>' == *src && pending > 0) {
            //rewind the write cursor onto the most recent kept '<';
            //pending > 0 guarantees the scan stops inside the buffer
            do {
                --dst;
            } while (*dst != '<');
            --pending;
            continue;
        }
        if ('<' == *src)
            ++pending;
        *dst++ = *src;
    }
    *dst = '\0';
}
//Remove whole elements of the form <tag ...> ... </tag> from buf, in place,
//for example <script ...>...</script> or <style ...>...</style>.
//
//buf - NUL-terminated HTML text to edit; NULL is ignored
//tag - element name without brackets, e.g. "script"; NULL is ignored
//
//Matching is a plain, case-sensitive substring search: an element starts at
//"<tag" and ends at the first "</tag>" that follows it.
//If the first "<tag" has no "</tag>" after it, buf is left unchanged.
//If a later "<tag" has no "</tag>" after it, the text is cut at that "<tag".
//On allocation failure a message is printed to stderr and buf is unchanged.
void htmlparser_remove_tag(char* buf, const char* tag)
{
    char *open_pat = NULL, *close_pat = NULL;
    char *dst, *close_at;
    size_t tag_len, buf_len, close_len;

    if (buf == NULL || tag == NULL)
        return;

    tag_len = strlen(tag);
    buf_len = strlen(buf);
    close_len = tag_len + 3;//strlen("</") + tag_len + strlen(">")

    open_pat = calloc(tag_len + 2, sizeof *open_pat);
    close_pat = calloc(close_len + 1, sizeof *close_pat);
    if (open_pat == NULL || close_pat == NULL)
    {
        fprintf(stderr,"Error allocating memory\n");
        goto done;
    }
    snprintf(open_pat, tag_len + 2, "<%s", tag);//construct "<tag"
    snprintf(close_pat, close_len + 1, "</%s>", tag);//construct "</tag>"

    //dst is the write position: it starts on the first "<tag"
    dst = strstr(buf, open_pat);
    if (dst == NULL)
        goto done;
    close_at = strstr(dst, close_pat);
    if (close_at == NULL)
        goto done;

    while (close_at != NULL)
    {
        //first character after "</tag>"; searching from exactly here keeps
        //the scan inside the string when "</tag>" ends the buffer and also
        //finds an element that starts immediately after the closing tag
        char *keep = close_at + close_len;
        char *next_open = strstr(keep, open_pat);
        size_t keep_len;

        if (next_open == NULL)
            next_open = buf + buf_len;

        //slide the text between "</tag>" and the next "<tag" (or the end) down
        keep_len = (size_t)(next_open - keep);
        memmove(dst, keep, keep_len);
        dst += keep_len;

        close_at = strstr(next_open, close_pat);
    }
    *dst = '\0';

done:
    free(open_pat);
    free(close_pat);
}
//Delete every occurrence of target from buf, in place.
//
//buf    - NUL-terminated string to edit; NULL is ignored
//target - substring to delete; NULL or "" leaves buf unchanged
//
//Occurrences are found left to right, without overlap, in the original text.
//The result is not scanned again, so a new occurrence created by joining the
//pieces around a deleted one is kept ("aabb" minus "ab" gives "ab").
void htmlparser_remove_str(char* buf, const char* target)
{
    char *dst, *src, *hit;
    size_t tar_len;

    if (buf == NULL || target == NULL)
        return;

    tar_len = strlen(target);
    //strstr() matches an empty needle at every position, so the scan could
    //never advance; there is nothing to delete anyway
    if (tar_len == 0)
        return;

    dst = strstr(buf, target);
    if (dst == NULL)
        return;

    //dst: where kept text is written; src: where unread text starts
    src = dst + tar_len;
    while ((hit = strstr(src, target)) != NULL)
    {
        size_t keep_len = (size_t)(hit - src);

        memmove(dst, src, keep_len);
        dst += keep_len;
        src = hit + tar_len;//consecutive occurrences yield keep_len == 0
    }
    //move the tail after the last occurrence, terminator included
    memmove(dst, src, strlen(src) + 1);
}
//Copy the text between the first "<title>" and the "</title>" that follows
//it from the HTML source into ptitle.
//
//buf     - NUL-terminated HTML source; NULL is ignored
//ptitle  - destination buffer; NULL is ignored
//maxlen  - size of ptitle in bytes, terminator included; values <= 0 are
//          ignored
//
//At most maxlen-1 characters are copied and the result is always terminated
//inside the buffer. When no "<title>" ... "</title>" pair is found, ptitle is
//left untouched. The search is case-sensitive and expects "<title>" without
//attributes.
void htmlparser_gettitle(const char* buf, char* ptitle, int maxlen)
{
    static const char open_tag[] = "<title>";
    static const char close_tag[] = "</title>";
    const char *start, *end;
    size_t len, room;

    if (buf == NULL || ptitle == NULL || maxlen <= 0)
        return;

    start = strstr(buf, open_tag);
    if (start == NULL)
        return;
    start += sizeof open_tag - 1;

    //look for the closing tag only after the opening one, so a stray
    //"</title>" earlier in the document cannot produce a negative length
    end = strstr(start, close_tag);
    if (end == NULL)
        return;

    len = (size_t)(end - start);
    room = (size_t)maxlen - 1;//keep one byte for the terminator
    if (len > room)
        len = room;

    memcpy(ptitle, start, len);
    ptitle[len] = '\0';
}
//Clean HTML source in place, in this order:
//  1. drop whole "script" and "style" elements via htmlparser_remove_tag
//  2. strip the remaining markup via htmlparser_remove_angle_brackets
//  3. delete each filler string listed below via htmlparser_remove_str
//
//buf - text to clean; passed unchanged to every helper
//
//NOTE(review): the first and last filler patterns look identical in this
//copy of the file; one of them may originally have been a different
//whitespace sequence - confirm against the repository.
void htmlparser_data_clean(char* buf)
{
    static const char *const block_tags[] = { "script", "style" };
    static const char *const fillers[] = { " ", "\n", "\t", " " };
    size_t i;

    for (i = 0; i < sizeof block_tags / sizeof block_tags[0]; ++i)
        htmlparser_remove_tag(buf, block_tags[i]);

    htmlparser_remove_angle_brackets(buf);

    for (i = 0; i < sizeof fillers / sizeof fillers[0]; ++i)
        htmlparser_remove_str(buf, fillers[i]);
}